From a9b65137b4ab90845c1357724d5ddaa805972830 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 9 Feb 2016 19:30:27 -0500 Subject: [PATCH 01/32] fix to validate_lang.pl regarding disambiguation symbols, and associated changes to how language-model disambiguation symbols are handled to make it easier to add more of them. --- egs/wsj/s5/utils/format_lm_sri.sh | 12 +-- egs/wsj/s5/utils/lang/add_lex_disambig.pl | 1 + egs/wsj/s5/utils/lang/check_g_properties.pl | 89 ++++++++++++++++ egs/wsj/s5/utils/lang/prepare_lang.sh | 1 + egs/wsj/s5/utils/lang/validate_lang.pl | 1 + egs/wsj/s5/utils/prepare_lang.sh | 63 +++++++----- egs/wsj/s5/utils/validate_lang.pl | 108 +++++++++++++++----- src/fstbin/fstaddselfloops.cc | 11 +- tools/extras/install_irstlm.sh | 2 +- 9 files changed, 225 insertions(+), 63 deletions(-) create mode 120000 egs/wsj/s5/utils/lang/add_lex_disambig.pl create mode 100755 egs/wsj/s5/utils/lang/check_g_properties.pl create mode 120000 egs/wsj/s5/utils/lang/prepare_lang.sh create mode 120000 egs/wsj/s5/utils/lang/validate_lang.pl diff --git a/egs/wsj/s5/utils/format_lm_sri.sh b/egs/wsj/s5/utils/format_lm_sri.sh index 7753c186045..7b5477e958a 100755 --- a/egs/wsj/s5/utils/format_lm_sri.sh +++ b/egs/wsj/s5/utils/format_lm_sri.sh @@ -71,8 +71,8 @@ if [ -z $loc ]; then export PATH=$PATH:$sdir:$sdir/.. else echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. + echo or installed in $sdir. cd to ../../../tools and run + echo extras/install_srilm.sh. exit 1 fi fi @@ -88,8 +88,8 @@ lm_base=$(basename $lm '.gz') gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ > $out_dir/oovs_${lm_base}.txt || exit 1; -# Removing all "illegal" combinations of and , which are supposed to -# occur only at being/end of utt. These can cause determinization failures +# Removing all "illegal" combinations of and , which are supposed to +# occur only at being/end of utt. These can cause determinization failures # of CLG [ends up being epsilon cycles]. gunzip -c $lm \ | egrep -v ' | | ' \ @@ -98,8 +98,8 @@ gunzip -c $lm \ awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; # Change the LM vocabulary to be the intersection of the current LM vocabulary -# and the set of words in the pronunciation lexicon. This also renormalizes the -# LM by recomputing the backoff weights, and remove those ngrams whose +# and the set of words in the pronunciation lexicon. This also renormalizes the +# LM by recomputing the backoff weights, and remove those ngrams whose # probabilities are lower than the backed-off estimates. 
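+# As a rough worked example (the words here are made up): if the ARPA LM's
+# vocabulary is {the, cat, sat, mat} but the lexicon only has {the, cat, sat},
+# the change-lm-vocab call below keeps only the n-grams over {the, cat, sat},
+# recomputes the backoff weights, and drops any n-gram whose probability has
+# fallen below its backed-off estimate.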
change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \ $srilm_opts || exit 1; diff --git a/egs/wsj/s5/utils/lang/add_lex_disambig.pl b/egs/wsj/s5/utils/lang/add_lex_disambig.pl new file mode 120000 index 00000000000..2d1d4425b49 --- /dev/null +++ b/egs/wsj/s5/utils/lang/add_lex_disambig.pl @@ -0,0 +1 @@ +../add_lex_disambig.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/check_g_properties.pl b/egs/wsj/s5/utils/lang/check_g_properties.pl new file mode 100755 index 00000000000..aa0e6eb1c78 --- /dev/null +++ b/egs/wsj/s5/utils/lang/check_g_properties.pl @@ -0,0 +1,89 @@ +#!/usr/bin/env perl + +use IPC::Open2; + +if (@ARGV != 1) { + print "Usage: $0 [options] \n"; + print "e.g.: $0 data/lang\n"; + exit(1); +} + +$lang = shift @ARGV; + +# This script checks that G.fst in the lang.fst directory is OK with respect +# to certain expected properties, and returns nonzero exit status if a problem was +# detected. It is called from validate_lang.pl. +# This only checks the properties of G that relate to disambiguation symbols, +# epsilons and forbidden symbols and . + +if (! -e "$lang/G.fst") { + print "$0: error: $lang/G.fst does not exist\n"; + exit(1); +} + +open(W, "<$lang/words.txt") || die "opening $lang/words.txt"; +$hash_zero = -1; +while () { + @A = split(" ", $_); + ($sym, $int) = @A; + if ($sym eq "" || $sym eq "") { $is_forbidden{$int} = 1; } + if ($sym eq "#0") { $hash_zero = $int; } +} + +if (-e "$lang/phones/wdisambig_words.int") { + open(F, "<$lang/phones/wdisambig_words.int") || die "opening $lang/phones/wdisambig_words.int"; + while () { + chop; + $is_disambig{$_} = 1; + } +} else { + $is_disambig{$hash_zero} = 1; +} + +$input_cmd = ". ./path.sh; fstprint $lang/G.fst|"; +open(G, $input_cmd) || die "running command $input_cmd"; + +$info_cmd = ". ./path.sh; fstcompile | fstinfo "; +open2(O, I, "$info_cmd") || die "running command $info_cmd"; + +$has_epsilons = 0; + +while () { + @A = split(" ", $_); + if (@A >= 4) { + if ($is_forbidden{$A[2]} || $is_forbidden{$A[3]}) { + chop; + print "$0: validating $lang: error: line $_ in G.fst contains forbidden symbol or \n"; + exit(1); + } elsif ($is_disambig{$A[2]}) { + print O $_; + if ($A[3] != 0) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has disambig on input but no epsilon on output\n"; + exit(1); + } + } elsif ($A[2] == 0) { + print O $_; + $has_epsilons = 1; + } elsif ($A[2] != $A[3]) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has inputs and outputs different but input is not disambig symbol.\n"; + exit(1); + } + } +} + +close(O); # tell 'fstcompile | fstinfo' pipeline that its input is done. +while () { + if (m/cyclic\s+/) { + print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons. Would cause determinization failure\n"; + exit(1); + } +} + +if ($has_epsilons) { + print "$0: warning: validating $lang: G.fst has epsilon-input arcs. 
We don't expect these in most setups.\n"; +} + +print "--> $0 successfully validated $lang/G.fst\n"; +exit(0); diff --git a/egs/wsj/s5/utils/lang/prepare_lang.sh b/egs/wsj/s5/utils/lang/prepare_lang.sh new file mode 120000 index 00000000000..96b9f592e82 --- /dev/null +++ b/egs/wsj/s5/utils/lang/prepare_lang.sh @@ -0,0 +1 @@ +../prepare_lang.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/validate_lang.pl b/egs/wsj/s5/utils/lang/validate_lang.pl new file mode 120000 index 00000000000..edb66bf3149 --- /dev/null +++ b/egs/wsj/s5/utils/lang/validate_lang.pl @@ -0,0 +1 @@ +../validate_lang.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 43b8bce1f4c..c8888dbcb8a 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -28,20 +28,21 @@ # and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt # and extra_questions.txt # Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and -# non-silence phones respectively (where silence includes various kinds of -# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the # "real" phones.) -# In each line of those files is a list of phones, and the phones on each line -# are assumed to correspond to the same "base phone", i.e. they will be +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be # different stress or tone variations of the same basic phone. -# The file "optional_silence.txt" contains just a single phone (typically SIL) +# The file "optional_silence.txt" contains just a single phone (typically SIL) # which is used for optional silence in the lexicon. # extra_questions.txt might be empty; typically will consist of lists of phones, -# all members of each list with the same stress or tone; and also possibly a -# list for the silence phones. This will augment the automtically generated -# questions (note: the automatically generated ones will treat all the -# stress/tone versions of a phone the same, so will not "get to ask" about +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. This will augment the automatically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about # stress or tone). +# # This script adds word-position-dependent phones and constructs a host of other # derived files, that go in data/lang/. @@ -49,19 +50,20 @@ # Begin configuration section. num_sil_states=5 num_nonsil_states=3 +num_word_disambig_syms=1 position_dependent_phones=true -# position_dependent_phones is false also when position dependent phones and word_boundary.txt +# position_dependent_phones is false also when position dependent phones and word_boundary.txt # have been generated by another source reverse=false -share_silence_phones=false # if true, then share pdfs of different silence +share_silence_phones=false # if true, then share pdfs of different silence # phones together. sil_prob=0.5 phone_symbol_table= # if set, use a specified phones.txt file. # end configuration sections -. utils/parse_options.sh +. 
utils/parse_options.sh -if [ $# -ne 4 ]; then +if [ $# -ne 4 ]; then echo "usage: utils/prepare_lang.sh " echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" echo " should contain the following files:" @@ -133,10 +135,10 @@ if $position_dependent_phones; then # adding the markers _B, _E, _S, _I depending on word position. # In this recipe, these markers apply to silence also. # Do this starting from lexiconp.txt only. - if "$silprob"; then + if "$silprob"; then perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; - if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } + if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt @@ -158,11 +160,11 @@ if $position_dependent_phones; then mv $tmpdir/lexiconp.pre_reverse $tmpdir/lexiconp.txt fi fi - + # create $tmpdir/phone_map.txt # this has the format (on each line) # ... - # where the versions depend on the position of the phone within a word. + # where the versions depend on the position of the phone within a word. # For instance, we'd have: # AA AA_B AA_E AA_I AA_S # for (B)egin, (E)nd, (I)nternal and (S)ingleton @@ -178,7 +180,7 @@ if $position_dependent_phones; then <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ > $tmpdir/phone_map.txt else - if "$silprob"; then + if "$silprob"; then cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt if $reverse; then echo "We do not support reverse option and silprob at the same time" @@ -277,7 +279,7 @@ if [[ ! -z $phone_symbol_table ]]; then start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'` echo "" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table ' BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\ - cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt + cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt else echo "" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ awk '{n=NR-1; print $1, n;}' > $dir/phones.txt @@ -313,7 +315,7 @@ fi cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' BEGIN { print " 0"; - } + } { if ($1 == "") { print " is in the vocabulary!" | "cat 1>&2" @@ -362,7 +364,7 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int # Create the basic L.fst without disambiguation symbols, for use -# in training. +# in training. if $silprob; then # Usually it's the same as having a fixed-prob L.fst @@ -386,7 +388,18 @@ cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; # integer version of oov symbol, used in some scripts. -# Create these lists of phones in colon-separated integer list form too, +# the file wdisambig.txt contains a (line-by-line) list of the text-form of the +# disambiguation symbols that are used in the grammar and passed through by the +# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork +# for more generality (which probably would be added by another script). 
+# wdisambig_words.int contains the corresponding list interpreted by the +# symbol table words.txt, and wdisambig_phones.int contains the corresponding +# list interpreted by the symbol table phones.txt. +echo '#0' >$dir/phones/wdisambig.txt +utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int +utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int + +# Create these lists of phones in colon-separated integer list form too, # for purposes of being given to programs as command-line options. for f in silence nonsilence optional_silence disambig context_indep; do utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int @@ -415,20 +428,18 @@ utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonel # Create the lexicon FST with disambiguation symbols, and put it in lang_test. # There is an extra step where we create a loop to "pass through" the # disambiguation symbols from G.fst. -phone_disambig_symbol=`grep \#0 $dir/phones.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` if $silprob; then utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; else utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; fi diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index ae087bd9578..415d06a4aaf 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -89,15 +89,7 @@ $wint2sym{$wsymtab{$_}} = $_; } } -if (exists $wsymtab{"#0"}) { - print "--> $lang/words.txt has \"#0\"\n"; - print "--> $lang/words.txt is OK\n"; -} else { - $warning = 1; - print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; - print "--> (if you are using ARPA-type language models, you will normally\n"; - print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; -} +print "--> $lang/words.txt is OK\n"; print "\n"; # Checking phones/* ------------------------------- @@ -113,7 +105,6 @@ sub check_txt_int_csl { if (!open(CSL, "<$cat.csl")) { $exit = 1; return print "--> ERROR: fail to open $cat.csl\n"; } - if (-z "$cat.txt") { $warning = 1; print "--> WARNING: $cat.txt is empty\n"; } @@ -743,6 +734,77 @@ sub check_summation { } } +sub check_wdisambig { + print "Checking word-level disambiguation symbols...\n"; + # This block checks that one of the two following conditions hold: + # (1) for lang diretories prepared by older versions of prepare_lang.sh: + # The symbol '#0' should appear in words.txt and phones.txt, and should + # or (2): the files wdisambig.txt, wdisambig_phones.int and wdisambig_words.int + # exist, and have the expected properties (see below for details). 
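+  # As an illustration (the integer ids below are made up), a lang directory
+  # with the single word-level disambiguation symbol '#0' would contain:
+  #   phones/wdisambig.txt:          #0
+  #   phones/wdisambig_words.int:    200004   (the id of '#0' in words.txt)
+  #   phones/wdisambig_phones.int:   352      (the id of '#0' in phones.txt)
+  # All three files have one entry per line and must be the same length.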
+ my %wdisambig_words_hash; + my %wdisambig_words_string = ""; + + if (! -e "$lang/phones/wdisambig.txt") { + print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)\n"; + if (exists $wsymtab{"#0"}) { + print "--> $lang/words.txt has \"#0\"\n"; + $wdisambig_words_hash{$wsymtab{"#0"}} = 1; + $wdisambig_words_string = $wsymtab{"#0"}; + } else { + print "--> ERROR: $lang/words.txt doesn't have \"#0\"\n"; + print "--> (if you are using ARPA-type language models, you will normally\n"; + print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; + $exit = 1; + } + } else { + print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n"; + if (!open(T, "<$lang/phones/wdisambig.txt")) { + print "--> ERROR: fail to open $lang/phones/wdisambig.txt\n"; $exit = 1; return; + } + chomp(my @wdisambig = ); + close(T); + if (!open(W, "<$lang/phones/wdisambig_words.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_words.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_words = ); + close(W); + if (!open(P, "<$lang/phones/wdisambig_phones.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_phones =
<P>
); + close(P); + my $len = @wdisambig, $len2; + if (($len2 = @wdisambig_words) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths"; + $exit = 1; return; + } + if (($len2 = @wdisambig_phones) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths"; + $exit = 1; return; + } + for (my $i = 0; $i < $len; $i++) { + if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch\n"; + $exit = 1; return; + } + } + for (my $i = 0; $i < $len; $i++) { + if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch\n"; + $exit = 1; return; + } + } + foreach my $i ( @wdisambig_words ) { + $wdisambig_words_hash{$i} = 1; + $wdisambig_words_string .= " " . $i; + } + } +} + +check_wdisambig(); + if (-e "$lang/G.fst") { # Check that G.fst is ilabel sorted and nonempty. $text = `. ./path.sh; fstinfo $lang/G.fst`; @@ -781,21 +843,17 @@ sub check_summation { } # Check that G.fst does not have cycles with only disambiguation symbols or - # epsilons on the input, or the forbidden symbols and . - $cmd = ". ./path.sh; fstprint $lang/G.fst | awk -v disambig=$lang/phones/disambig.int -v words=$lang/words.txt 'BEGIN{while((getline0) is_disambig[\$1]=1; is_disambig[0] = 1; while((getline0){ if(\$1==\"\"||\$1==\"\") is_forbidden[\$2]=1;}} {if(NF<3 || is_disambig[\$3]) print; else if(is_forbidden[\$3] || is_forbidden[\$4]) { print \"Error: line \" \$0 \" in G.fst contains forbidden symbol or \" | \"cat 1>&2\"; exit(1); }}' | fstcompile | fstinfo "; - $output = `$cmd`; - if ($output !~ m/# of states\s+[1-9]/) { # fstinfo did not read a nonempty FST (there should be final probs at least)... - print "--> ERROR: failure running command to check for disambig-sym loops [possibly G.fst " . - "contained the forbidden symbols or , or possibly some other error.. Output was: \n"; - print $output; - $exit = 1; - } - if ($output !~ m/cyclic\s+n/) { # FST was cyclic after selecting only for disambig symbols. This is now allowed. - print "--> ERROR: G.fst contained cycles with only disambiguation symbols or epsilons on the input. Would cause determinization failure in graph creation.\n"; - $exit = 1; - } else { - print "--> G.fst did not contain cycles with only disambig symbols or epsilon on the input, and did not contain\n" . - "the forbidden symbols or (if present in vocab) on the input or output.\n"; + # epsilons on the input, or the forbidden symbols and (and a few + # related checks + + if (-e "$lang/G.fst") { + system("utils/lang/check_g_properties.pl $lang"); + if ($? != 0) { + print "--> ERROR: failure running check_g_properties.pl\n"; + $exit = 1; + } else { + print("--> utils/lang/check_g_properties.pl succeeded.\n"); + } } } diff --git a/src/fstbin/fstaddselfloops.cc b/src/fstbin/fstaddselfloops.cc index 9219093bee1..96895f23cf4 100644 --- a/src/fstbin/fstaddselfloops.cc +++ b/src/fstbin/fstaddselfloops.cc @@ -45,8 +45,9 @@ int main(int argc, char *argv[]) { "on at least one arc out of the state. 
Useful in conjunction with predeterminize\n" "\n" "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst [out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n"; - + "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" + "in.list and out.list are lists of integers, one per line, of the\n" + "same length.\n"; ParseOptions po(usage); po.Read(argc, argv); @@ -62,12 +63,12 @@ int main(int argc, char *argv[]) { fst_out_filename = po.GetOptArg(4); VectorFst *fst = ReadFstKaldi(fst_in_filename); - + std::vector disambig_in; if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " << kaldi::PrintableRxfilename(disambig_in_rxfilename); - + std::vector disambig_out; if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { WriteFstKaldi(*fst, fst_out_filename); delete fst; - + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/tools/extras/install_irstlm.sh b/tools/extras/install_irstlm.sh index 680e37a60b4..53ac392ac0e 100755 --- a/tools/extras/install_irstlm.sh +++ b/tools/extras/install_irstlm.sh @@ -55,7 +55,7 @@ fi [ ! -z ${IRSTLM} ] && \ echo >&2 "IRSTLM config is already in env.sh" && exit - wd=`readlink -f $wd || pwd` + wd=`readlink -f $wd 2>/dev/null || pwd` echo "export IRSTLM=$wd/irstlm" echo "export PATH=\${PATH}:\${IRSTLM}/bin" From 117c075076ff9c8b0589913d8cc8770901313ba9 Mon Sep 17 00:00:00 2001 From: vesis84 Date: Thu, 11 Feb 2016 17:39:46 +0100 Subject: [PATCH 02/32] nnet1: added the removal of frames with 0 confidence from mini-batches --- src/nnetbin/nnet-train-frmshuff.cc | 33 ++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/nnetbin/nnet-train-frmshuff.cc b/src/nnetbin/nnet-train-frmshuff.cc index 8cc065add4f..1d804f971c0 100644 --- a/src/nnetbin/nnet-train-frmshuff.cc +++ b/src/nnetbin/nnet-train-frmshuff.cc @@ -226,6 +226,39 @@ int main(int argc, char *argv[]) { // apply optional feature transform nnet_transf.Feedforward(CuMatrix(mat), &feats_transf); + // remove frames with '0' weight from training, + { + // are there frames to be removed? 
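+        // Note: '!weights.Min() > 0.0' parses as '(!weights.Min()) > 0.0', so the
+        // branch below is taken exactly when the smallest per-frame weight is 0.0
+        // (assuming the per-frame confidences are non-negative).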
+ if (!weights.Min() > 0.0) { + // create vector with frame-indices to keep, + std::vector keep_frames; + for (int32 i=0; i 0.0) + keep_frames.push_back(i); + } + if (keep_frames.size() == 0) continue; // all frames removed, skip sentence, + + // filter feature-frames, + CuMatrix tmp_feats(keep_frames.size(), feats_transf.NumCols()); + tmp_feats.CopyRows(feats_transf, CuArray(keep_frames)); + tmp_feats.Swap(&feats_transf); + + // filter targets, + Posterior tmp_targets; + for (int32 i=0; i tmp_weights(keep_frames.size()); + for (int32 i=0; i Date: Fri, 12 Feb 2016 17:16:24 -0500 Subject: [PATCH 03/32] some cosmetic improvements to slurm.pl and to the fisher_callhome_spanish recipe --- .../s5/local/fsp_data_prep.sh | 40 +++++------ egs/fisher_callhome_spanish/s5/run.sh | 22 +++--- egs/wsj/s5/utils/slurm.pl | 69 ++++++++++--------- 3 files changed, 68 insertions(+), 63 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index f453ab42058..8fe80b46784 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -1,13 +1,13 @@ #!/bin/bash # # Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) -# In addition the transcripts are needed as well. +# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) +# In addition the transcripts are needed as well. # To be run from one directory above this script. # Note: when creating your own data preparation scripts, it's a good idea # to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the +# id, that the output scp file is sorted on utterance id, and that the # transcription file is exactly the same length as the scp file and is also # sorted on utterance id (missing transcriptions should be removed from the # scp file using e.g. scripts/filter_scp.pl) @@ -18,8 +18,8 @@ export LC_ALL=C if [ $# -lt 2 ]; then - echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories, se -e ../run.sh for example." 
+ echo "Usage: $0 " + echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" exit 1; fi @@ -72,20 +72,20 @@ fi speech_d1=$dir/links/LDC2010S01/DISC1/data/speech speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -transcripts=$dir/links/LDC2010T04/data/transcripts - -fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts +transcripts=$dir/links/LDC2010T04/data/transcripts + +fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts #Now check if we got all the files that we needed -if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" - echo "The transcripts should contain 819 files" - exit 1; -fi +if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" + echo "The transcripts should contain 819 files" + exit 1; +fi if [ $stage -le 0 ]; then #Gather all the speech files together to create a file list @@ -105,7 +105,7 @@ if [ $stage -le 1 ]; then mv $tmpdir/reco2file_and_channel $dir/train_all/ fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then sort $tmpdir/text.1 | grep -v '((' | \ awk '{if (NF > 1){ print; }}' | \ sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ @@ -149,7 +149,7 @@ if [ $stage -le 3 ]; then for f in `cat $tmpdir/train_sph.flist`; do # convert to absolute path readlink -e $f - done > $tmpdir/train_sph_abs.flist + done > $tmpdir/train_sph_abs.flist cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 706f3793278..edd7f56bad2 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -17,12 +17,10 @@ set -e sfisher_speech=/home/mpost/data/LDC/LDC2010S01 sfisher_transcripts=/home/mpost/data/LDC/LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 -#split=/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt split=local/splits/split_fisher callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 -#split_callhome=/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome split=local/splits/split_callhome local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts @@ -33,16 +31,16 @@ local/fsp_prepare_dict.sh $spanish_lexicon # Rewrite ----------------------------- This section is no longer needed---- # At this point, it might make sense to use a bigger lexicon -# The one I will use is derived from this exercise (spanish fisher) and -# the LDC spanish lexicon along with the most frequent words derived from the +# The one I will use is derived from this exercise (spanish fisher) and +# the LDC spanish lexicon along with the most frequent words derived from the # gigaword corpus such that the total number 
of entries in the lexicon # are 64k # To generate the merged lexicon, run # /export/a04/gkumar/corpora/gigaword/bin/merge_lexicons.py # you might have to set the locations of the three lexicons within this -# file. Note that the LDC rule base phoneme generator works only from its -# own directory. So the merged lexicon is actually created in +# file. Note that the LDC rule base phoneme generator works only from its +# own directory. So the merged lexicon is actually created in # /export/a04/gkumar/corpora/LDC9..../spanish_lexicon../lexicon64k # This can be easily fixed and will be done. #TODO # Also run the clean lexicon script to take care of non stressable vowels @@ -57,11 +55,11 @@ utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang # Make sure that you do not use your test and your dev sets to train the LM -# Some form of cross validation is possible where you decode your dev/set based on an +# Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl -# to get the numbers. Depending on your needs, you might have to change the size of -# the splits within that file. The default paritions are based on the Kaldi + Joshua +# to get the numbers. Depending on your needs, you might have to change the size of +# the splits within that file. The default paritions are based on the Kaldi + Joshua # requirements which means that I have very large dev and test sets local/fsp_train_lms.sh $split local/fsp_create_test_lang.sh @@ -95,7 +93,7 @@ cp -r data/local/data/callhome_train_all data/callhome_train_all # MT Tune : Same as the ASR eval set (Use the lattices from here) # MT Eval : 20k utterances # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker -# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. +# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. # As noted above, the LM has not been trained on the dev and the test sets. #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test @@ -136,7 +134,7 @@ utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k local/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k -utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k +utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ data/train_10k_nodup data/lang exp/mono0a @@ -178,7 +176,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. 
on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ diff --git a/egs/wsj/s5/utils/slurm.pl b/egs/wsj/s5/utils/slurm.pl index 68c269080ac..8095272732e 100755 --- a/egs/wsj/s5/utils/slurm.pl +++ b/egs/wsj/s5/utils/slurm.pl @@ -11,7 +11,7 @@ use Cwd; use Getopt::Long; -# slurm.pl was created from the queue.pl +# slurm.pl was created from the queue.pl # queue.pl has the same functionality as run.pl, except that # it runs the job in question on the queue (Sun GridEngine). # This version of queue.pl uses the task array functionality @@ -20,7 +20,7 @@ # The script now supports configuring the queue system using a config file # (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. +# and a set of command line options. # The current script handles: # 1) Normal configuration arguments # For e.g. a command line option of "--gpu 1" could be converted into the option @@ -30,7 +30,7 @@ # $0 here in the line is replaced with the argument read from the CLI and the # resulting string is passed to qsub. # 2) Special arguments to options such as -# gpu=0 +# gpu=0 # If --gpu 0 is given in the command line, then no special "-q" is given. # 3) Default argument # default gpu=0 @@ -60,7 +60,7 @@ my $qsub_opts = ""; my $sync = 0; my $num_threads = 1; -my $max_jobs_run; +my $max_jobs_run; my $gpu = 0; my $config = "conf/slurm.conf"; @@ -99,12 +99,12 @@ () print_usage(); } -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to +for (my $x = 1; $x <= 3; $x++) { # This for-loop is to # allow the JOB=1:n option to be interleaved with the # options to qsub. while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { my $switch = shift @ARGV; - + if ($switch eq "-V") { $qsub_opts .= "-V "; } else { @@ -121,10 +121,10 @@ () $num_threads = $argument2; } elsif ($switch =~ m/^--/) { # Config options # Convert CLI option to variable name - # by removing '--' from the switch and replacing any + # by removing '--' from the switch and replacing any # '-' with a '_' $switch =~ s/^--//; - $switch =~ s/-/_/g; + $switch =~ s/-/_/g; $cli_options{$switch} = $argument; } else { # Other qsub options - passed as is $qsub_opts .= "$switch $argument "; @@ -160,7 +160,7 @@ () if (exists $cli_options{"config"}) { $config = $cli_options{"config"}; -} +} my $default_config_file = <<'EOF'; # Default configuration @@ -168,17 +168,18 @@ () option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 # Do not add anything to qsub_opts -option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 +option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts -option max_jobs_run=* # Do nothing default gpu=0 option gpu=0 -p shared option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0 # this has to be figured out +# note: the --max-jobs-run option is supported as a special case +# by slurm.pl and you don't have to handle it in the config file. EOF # Here the configuration options specified by the user on the command line # (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line +# the config file. (e.g. 
if the config file has the line # "option mem=* -l ram_free=$0,mem_free=$0" # and the user has specified '--mem 2G' on the command line, the options # passed to queue system would be "-l ram_free=2G,mem_free=2G @@ -192,7 +193,7 @@ () my %cli_config_options = (); my %cli_default_options = (); -if ($opened_config_file == 0 && exists($cli_options{"config"})) { +if ($opened_config_file == 0 && exists($cli_options{"config"})) { print STDERR "Could not open config file $config\n"; exit(1); } elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { @@ -212,12 +213,12 @@ () if ($_ =~ /^command (.+)/) { $read_command = 1; $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { # Config option that needs replacement with parameter value read from CLI # e.g.: option mem=* -l mem_free=$0,ram_free=$0 my $option = $1; # mem my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { + if ($arg !~ m:\$0:) { print STDERR "Warning: the line '$line' in config file ($config) does not substitution variable \$0\n"; } if (exists $cli_options{$option}) { @@ -237,7 +238,7 @@ () } } elsif ($_ =~ m/^default (\S+)=(\S+)/) { # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line + # the user does not specify the option on the command line # e.g. default gpu=0 my $option = $1; # gpu my $value = $2; # 0 @@ -261,19 +262,25 @@ () for my $option (keys %cli_options) { if ($option eq "config") { next; } - if ($option eq "max_jobs_run" && $array_job != 1) { print STDERR "Ignoring $option\n"; next; } + my $value = $cli_options{$option}; - - if ($option eq "max_jobs_run") { $max_jobs_run = $value; } - if (exists $cli_default_options{($option,$value)}) { + if ($option eq "max_jobs_run") { + if ($array_job != 1) { + print STDERR "Ignoring $option since this is not an array task."; + } else { + $max_jobs_run = $value; + } + } elsif (exists $cli_default_options{($option,$value)}) { $qsub_opts .= "$cli_default_options{($option,$value)} "; } elsif (exists $cli_config_options{$option}) { $qsub_opts .= "$cli_config_options{$option} "; } elsif (exists $cli_default_options{($option,"*")}) { $qsub_opts .= $cli_default_options{($option,"*")} . " "; } else { - if ($opened_config_file == 0) { $config = "default config file"; } + if ($opened_config_file == 0) { + $config = "default config file"; + } die "$0: Command line option $option not described in $config (or value '$value' not allowed)\n"; } } @@ -301,7 +308,7 @@ () # my $cmd = ""; -foreach my $x (@ARGV) { +foreach my $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take # as-is. elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single @@ -322,23 +329,23 @@ () # make a directory called "q", # where we will put the log created by qsub... normally this doesn't contain # anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { +if (! -d "$qdir") { system "mkdir $qdir 2>/dev/null"; sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been ## created and the job immediately ran, it would die with an error because nfs ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our ## NFS settings to something like 5 seconds. -} +} my $queue_array_opt = ""; if ($array_job == 1) { # It's an array job. 
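  # For example, passing '--max-jobs-run 10' with JOB=1:50 yields the option
  # '--array 1-50%10', which asks SLURM to run at most 10 of the 50 array
  # tasks at any one time.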
if ($max_jobs_run) { - $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; + $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; } else { - $queue_array_opt = "--array ${jobstart}-${jobend}"; + $queue_array_opt = "--array ${jobstart}-${jobend}"; } - $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get + $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get # replaced by qsub, in each job, with the job-id. $cmd =~ s/$jobname/\$\{SLURM_ARRAY_TASK_ID\}/g; # same for the command... $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory @@ -475,14 +482,14 @@ () } } - # Check that the job exists in SLURM. Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. + # Check that the job exists in SLURM. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. if ( -f $f ) { next; }; #syncfile appeared: OK. $ret = system("squeue -j $sge_job_id >/dev/null 2>/dev/null"); # system(...) : To get the actual exit value, shift $ret right by eight bits. if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. sleep(3); @@ -546,7 +553,7 @@ () push @logfiles, $logfile; } else { for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; + my $l = $logfile; $l =~ s/\$SLURM_ARRAY_TASK_ID/$jobid/g; push @logfiles, $l; } From 032aa24b77655c3c7d347be82d262b0787faee57 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 14 Feb 2016 15:21:00 +0330 Subject: [PATCH 04/32] Minor fix regarding adaptation configs --- src/online2/online-gmm-decoding.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/online2/online-gmm-decoding.h b/src/online2/online-gmm-decoding.h index 41c9ca4c14d..8bec6cd9ab9 100644 --- a/src/online2/online-gmm-decoding.h +++ b/src/online2/online-gmm-decoding.h @@ -71,10 +71,10 @@ struct OnlineGmmDecodingAdaptationPolicyConfig { opts->Register("adaptation-first-utt-ratio", &adaptation_first_utt_ratio, "Ratio that controls frequency of fMLLR adaptation for first " "utterance of each speaker"); - opts->Register("adaptation-delay", &adaptation_first_utt_delay, + opts->Register("adaptation-delay", &adaptation_delay, "Delay before first basis-fMLLR adaptation for not-first " "utterances of each speaker"); - opts->Register("adaptation-ratio", &adaptation_first_utt_ratio, + opts->Register("adaptation-ratio", &adaptation_ratio, "Ratio that controls frequency of fMLLR adaptation for " "not-first utterances of each speaker"); } From 8aa016ad500d3242de3fa373d942b9efe9cc5fff Mon Sep 17 00:00:00 2001 From: Joshua Milas Date: Sun, 14 Feb 2016 14:14:33 -0500 Subject: [PATCH 05/32] If compiling with MSVS 2015, dont redefine snprintf --- src/base/kaldi-utils.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/base/kaldi-utils.cc b/src/base/kaldi-utils.cc index 13a3412a9bb..1ae1dc0b758 100644 --- a/src/base/kaldi-utils.cc +++ b/src/base/kaldi-utils.cc @@ -20,7 +20,9 @@ #include #elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW) #include +#if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf +#endif /* _MSC_VER < 1900 */ #else #include #endif From 1fa0e18e715389e232e634ac38050a640344ad8b Mon Sep 17 
00:00:00 2001 From: Daniel Povey Date: Sun, 14 Feb 2016 19:44:20 -0500 Subject: [PATCH 06/32] adding a couple more swbd+chain tuning experiments --- egs/swbd/s5c/local/chain/run_tdnn_5w.sh | 10 + egs/swbd/s5c/local/chain/run_tdnn_5x.sh | 476 ++++++++++++++++++++++++ 2 files changed, 486 insertions(+) create mode 100755 egs/swbd/s5c/local/chain/run_tdnn_5x.sh diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5w.sh b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh index e21c3a8b04f..1a40acfa105 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_5w.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh @@ -6,6 +6,16 @@ # 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim # from 1800 to 1700. +# Difference is tiny. +#local/chain/compare_wer.sh 5k 5w +#System 5k 5w +#WER on train_dev(tg) 16.46 16.56 +#WER on train_dev(fg) 15.17 15.30 +#WER on eval2000(tg) 18.1 18.1 +#WER on eval2000(fg) 16.5 16.4 +#Final train prob -0.105502 -0.106549 +#Final valid prob -0.12337 -0.120079 + # _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer # in the middle, like 5e->5g, to see whether it recovers some of the improvement # of using the iVectors. diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5x.sh b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh new file mode 100755 index 00000000000..e50dadfd963 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh @@ -0,0 +1,476 @@ +#!/bin/bash + +# _5w is as _5x but decreasing the context of the averaging layer from +-0.99 +# seconds to +-0.66 seconds. I would not have expected this to work a priori, +# but the change from 5k -> 5l, which made the context wider, made WERs slightly +# worse, so I'd like to see what happens when we decrease the context. + +# It's worse. Odd because increasing the context (5k->5l) seemed to be a little +# worse also. +local/chain/compare_wer.sh 5w 5x +#System 5w 5x +#WER on train_dev(tg) 16.56 16.66 +#WER on train_dev(fg) 15.30 15.41 +#WER on eval2000(tg) 18.1 18.5 +#WER on eval2000(fg) 16.4 16.6 +#Final train prob -0.106549 -0.105693 +#Final valid prob -0.120079 -0.121834 + +# _5w is as _5k (which is a fairly good-performing ivector-free model), but +# making the same changes as 5e -> 5t, which makes the model more lightweight +# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to +# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim +# from 1800 to 1700. + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. 
+ +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5w_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-63:3:9:63) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; From d5b8b237c5e046bfd2091c2ae65e5693d18f20f0 Mon Sep 17 00:00:00 2001 From: Sabine Crevoisier Date: Mon, 15 Feb 2016 17:19:12 +0000 Subject: [PATCH 07/32] Modified bash command to avoid wildcard expansion when using phones with *. --- egs/wsj/s5/utils/prepare_lang.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 43b8bce1f4c..e451492cc1d 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -174,8 +174,8 @@ if $position_dependent_phones; then # This phone map expands the phone lists into all the word-position-dependent # versions of the phone lists. 
- cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + cat <(cat $srcdir/silence_phones.txt | while read x; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + <(cat $srcdir/nonsilence_phones.txt | while read x; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ > $tmpdir/phone_map.txt else if "$silprob"; then @@ -245,10 +245,10 @@ cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_m # be inside a word. if $position_dependent_phones; then for suffix in _B _E _I _S; do - (for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (cat $srcdir/nonsilence_phones.txt | while read x; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done for suffix in "" _B _E _I _S; do - (for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (cat $srcdir/silence_phones.txt | while read x; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done fi From 2a862b47419b7b8a2737f162341c4c011a1a12fc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Feb 2016 17:16:46 -0500 Subject: [PATCH 08/32] swbd+chain: Add some new example scripts and an associated change in the config script to support skip-splicing. --- egs/swbd/s5c/local/chain/run_tdnn_5y.sh | 466 +++++++++++++++++++ egs/swbd/s5c/local/chain/run_tdnn_5z.sh | 457 ++++++++++++++++++ egs/swbd/s5c/local/chain/run_tdnn_6a.sh | 461 ++++++++++++++++++ egs/wsj/s5/steps/nnet3/make_jesus_configs.py | 28 +- 4 files changed, 1403 insertions(+), 9 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/run_tdnn_5y.sh create mode 100755 egs/swbd/s5c/local/chain/run_tdnn_5z.sh create mode 100755 egs/swbd/s5c/local/chain/run_tdnn_6a.sh diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh new file mode 100755 index 00000000000..f89c1f5deac --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh @@ -0,0 +1,466 @@ +#!/bin/bash + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
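+
+# (A rough back-of-the-envelope check, not from the original notes, of the "about a
+# million parameters" statement at the top of this file: assuming the final layer is
+# essentially an affine transform from the final hidden dim to the ~9000 pdfs that
+# build_tree.sh is asked for below,
+#   echo $(( (500 - 400) * 9000 ))    # = 900000 fewer parameters in the final layer
+# while the increases to --jesus-forward-input-dim and --jesus-forward-output-dim put a
+# comparable number of parameters back into the hidden layers.)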
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh new file mode 100755 index 00000000000..0f3e89470d8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh @@ -0,0 +1,457 @@ +#!/bin/bash + +# _5z is as _5v, but adding skip-splicing (a new configuration option) + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
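+
+# (The skip-splicing this script introduces shows up in the --splice-indexes string
+# passed to train_tdnn.sh further down, as the skip0 / skip-3 entries appended to some
+# of the layers; compare the otherwise-identical string used in run_tdnn_5y.sh:
+#   5y: "-1,0,1 -1,0,1,2 -3,0,3       -3,0,3       -3,0,3       -6,-3,0"
+#   5z: "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3"
+# the handling of these entries is presumably the change to make_jesus_configs.py that
+# this same commit makes to support skip-splicing.)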
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
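+
+# (A note on reading the "0.2% better" / "0.2% worse" annotations in the 2o->2y table
+# above: they are absolute WER differences, e.g. for train_dev,tg
+#   17.24 - 16.99 = 0.25    # ~0.2-0.3% absolute, or roughly 1.5% relative
+# not relative reductions.)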
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh new file mode 100755 index 00000000000..70bd894f313 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -0,0 +1,461 @@ +#!/bin/bash + +# _5z is as _5z, but adding the change in configuration that +# we made from _5v to _5y (moving some parameters from final layer +# to hidden parts of network) + +# _5z is as _5v, but adding skip-splicing (a new configuration option) + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
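+# (As a quick illustration of how the per-condition comparisons in these notes
+# are summarized: for the 2o->2y table above, averaging the four WER deltas, e.g.
+#   python -c "print(sum([16.99-17.24, 15.86-15.93, 18.9-18.7, 17.0-16.9])/4)"
+# gives about -0.005, i.e. essentially zero, which is what "the same on average"
+# refers to.  This one-liner is purely illustrative and not part of the recipe.)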
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py index d0008e81711..39ed9f961e0 100755 --- a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -112,7 +112,7 @@ def __init__(self, config_string, input_dim, input_name): self.input_dim = input_dim self.input_name = input_name - m = re.search("(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", + m = re.match("^(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)$", config_string) if m == None: sys.exit("Invalid splice-index or statistics-config string: " + config_string) @@ -204,8 +204,9 @@ def WriteConfigs(self, f): try: x = StatisticsConfig(s, 100, 'foo') except: - sys.exit("The following element of the splicing array is not a valid specifier " - "of statistics: " + s) + if re.match("skip(-?\d+)$", s) == None: + sys.exit("The following element of the splicing array is not a valid specifier " + "of statistics or of the form skipDDD: " + s) if leftmost_splice == 10000 or rightmost_splice == -10000: sys.exit("invalid element of --splice-indexes: " + string) @@ 
-295,12 +296,21 @@ def WriteConfigs(self, f): splices.append('Offset({0}, {1})'.format(cur_output, offset)) spliced_dims.append(cur_affine_output_dim) except: - # it's not an integer offset, so assume it specifies the - # statistics-extraction. - stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) - stats.WriteConfigs(f) - splices.append(stats.Descriptor()) - spliced_dims.extend(stats.OutputDims()) + # it's not an integer offset, so assume it either specifies the + # statistics-extraction, or is of the form skipXX where XX is an + # integer offset (this takes as input the previous post-jesus layer). + m = re.match("skip(-?\d+)$", s) + if m != None: + if l <= 2: + sys.exit("You cannot use skip-splicing for the 1st 2 layers") + offset = m.group(1) + splices.append("Offset(post-jesus{0}, {1})".format(l-1, offset)) + spliced_dims.append(args.jesus_forward_output_dim) + else: + stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) + stats.WriteConfigs(f) + splices.append(stats.Descriptor()) + spliced_dims.extend(stats.OutputDims()) # get the input to the Jesus layer. cur_input = 'Append({0})'.format(', '.join(splices)) From d25785ddc1d24269cb71497092bfa209e9fbe84d Mon Sep 17 00:00:00 2001 From: Xingyu Na Date: Tue, 16 Feb 2016 10:48:25 +0800 Subject: [PATCH 09/32] small bug fix for fisher_swbd data prep --- egs/fisher_swbd/s5/local/swbd1_data_prep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh index 0a8a375b7ed..98a12e1c0a3 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh @@ -102,7 +102,7 @@ local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final trans # format acronyms in text python local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \ - -M data/local/dict/acronyms_swbd.map + -M data/local/dict_nosp/acronyms_swbd.map cp $dir/text $dir/text_bk mv $dir/text_map $dir/text From e06745d58e24152071a72b04b11ca67d16c967c4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 16 Feb 2016 02:20:35 -0500 Subject: [PATCH 10/32] adding some newer results for swbd+chain tuning; reverting skip-splicing option which I found not helpful. --- egs/swbd/s5c/local/chain/run_tdnn_5y.sh | 10 ++++++ egs/swbd/s5c/local/chain/run_tdnn_5z.sh | 9 ++++++ egs/swbd/s5c/local/chain/run_tdnn_6a.sh | 33 +++++++++++++++----- egs/wsj/s5/steps/nnet3/make_jesus_configs.py | 28 ++++++----------- 4 files changed, 54 insertions(+), 26 deletions(-) diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh index f89c1f5deac..54769c23734 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh @@ -11,6 +11,16 @@ # hidden parts of the network. Hopefully this will reduce overtraining, since # the hidden parts of the network are regularized by the --xent-regularize option. +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). +#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + # _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. # WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
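
Since the next file (run_tdnn_5z.sh) documents the skip-splicing experiment that this patch reverts, a short standalone sketch of how a "skipN" token in --splice-indexes was interpreted may help.  It paraphrases the make_jesus_configs.py change from the earlier patch in this series; it is not part of the scripts, the layer/descriptor names ("jesus2-output", "post-jesusN") are placeholders, and the statistics-extraction branch of the real script is omitted.

#!/usr/bin/env python
# Illustrative sketch only: how one entry of --splice-indexes would be turned
# into a Descriptor under the (now reverted) skip-splicing scheme.  Plain
# integers splice the current layer's input at that time offset; "skipN"
# splices the previous layer's post-jesus output at offset N.
import re

def splice_entry_to_descriptor(entry, layer, cur_output):
    splices = []
    for s in entry.split(","):
        try:
            offset = int(s)
            splices.append("Offset({0}, {1})".format(cur_output, offset))
        except ValueError:
            m = re.match(r"skip(-?\d+)$", s)
            if m is None:
                raise ValueError("unhandled splice token: " + s)
            if layer <= 2:
                raise ValueError("skip-splicing was not allowed for the first 2 layers")
            # takes as input the previous post-jesus layer
            splices.append("Offset(post-jesus{0}, {1})".format(layer - 1, m.group(1)))
    return "Append({0})".format(", ".join(splices))

if __name__ == "__main__":
    # e.g. the third entry of "-1,0,1 -1,0,1,2 -3,0,3,skip0 ..." used before the revert
    print(splice_entry_to_descriptor("-3,0,3,skip0", layer=3,
                                     cur_output="jesus2-output"))
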
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh index 0f3e89470d8..57910eb00c7 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh @@ -1,6 +1,15 @@ #!/bin/bash # _5z is as _5v, but adding skip-splicing (a new configuration option) +# It seems definitely not helpful. I'll remove the option soon. +#local/chain/compare_wer.sh 5v 5z +#System 5v 5z +#WER on train_dev(tg) 15.38 15.60 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.6 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.113823 +#Final valid prob -0.131797 -0.131356 # _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh index 70bd894f313..12589033819 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -1,10 +1,29 @@ #!/bin/bash -# _5z is as _5z, but adding the change in configuration that -# we made from _5v to _5y (moving some parameters from final layer -# to hidden parts of network) - -# _5z is as _5v, but adding skip-splicing (a new configuration option) +# _6a is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). +#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 # _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. 
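+# (Back-of-the-envelope check of the "about a million parameters" figure quoted
+# in the _5y note above: assuming the final affine layer maps final-hidden-dim to
+# the ~9000 tree leaves this recipe builds, changing that dim between 500 and 400
+# accounts for roughly
+#   python -c "print((500-400)*9000)"   # -> 900000 weights
+# The exact count depends on the config generator (biases, the xent branch, etc.),
+# so treat this as illustrative only.)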
@@ -412,8 +431,8 @@ if [ $stage -le 12 ]; then --leaky-hmm-coefficient 0.1 \ --l2-regularize 0.00005 \ --egs-dir exp/chain/tdnn_2y_sp/egs \ - --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ --apply-deriv-weights false \ --frames-per-iter 1200000 \ --lm-opts "--num-extra-lm-states=2000" \ diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py index 39ed9f961e0..d0008e81711 100755 --- a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -112,7 +112,7 @@ def __init__(self, config_string, input_dim, input_name): self.input_dim = input_dim self.input_name = input_name - m = re.match("^(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)$", + m = re.search("(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", config_string) if m == None: sys.exit("Invalid splice-index or statistics-config string: " + config_string) @@ -204,9 +204,8 @@ def WriteConfigs(self, f): try: x = StatisticsConfig(s, 100, 'foo') except: - if re.match("skip(-?\d+)$", s) == None: - sys.exit("The following element of the splicing array is not a valid specifier " - "of statistics or of the form skipDDD: " + s) + sys.exit("The following element of the splicing array is not a valid specifier " + "of statistics: " + s) if leftmost_splice == 10000 or rightmost_splice == -10000: sys.exit("invalid element of --splice-indexes: " + string) @@ -296,21 +295,12 @@ def WriteConfigs(self, f): splices.append('Offset({0}, {1})'.format(cur_output, offset)) spliced_dims.append(cur_affine_output_dim) except: - # it's not an integer offset, so assume it either specifies the - # statistics-extraction, or is of the form skipXX where XX is an - # integer offset (this takes as input the previous post-jesus layer). - m = re.match("skip(-?\d+)$", s) - if m != None: - if l <= 2: - sys.exit("You cannot use skip-splicing for the 1st 2 layers") - offset = m.group(1) - splices.append("Offset(post-jesus{0}, {1})".format(l-1, offset)) - spliced_dims.append(args.jesus_forward_output_dim) - else: - stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) - stats.WriteConfigs(f) - splices.append(stats.Descriptor()) - spliced_dims.extend(stats.OutputDims()) + # it's not an integer offset, so assume it specifies the + # statistics-extraction. + stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) + stats.WriteConfigs(f) + splices.append(stats.Descriptor()) + spliced_dims.extend(stats.OutputDims()) # get the input to the Jesus layer. 
cur_input = 'Append({0})'.format(', '.join(splices)) From 341e0f023a5083d3c31e207097be1ca254bf1e80 Mon Sep 17 00:00:00 2001 From: Gaurav Kumar Date: Tue, 16 Feb 2016 20:00:45 -0500 Subject: [PATCH 11/32] Changes to allow the large spanish word list to be downloaded if not present --- .../s5/local/fsp_prepare_dict.sh | 30 ++++++++++-- .../s5/local/merge_lexicons.py | 47 ++++++++++--------- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 0f2bd037ba0..824edd99da8 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -22,12 +22,32 @@ lexicon=$1 #Get all unique words, remove punctuation. if [ $stage -le 0 ]; then cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords - if [ -f "/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" ]; then - # Merge with gigaword corpus - $local/merge_lexicons.py - mv $tmpdir/uniquewords $tmpdir/uniquewords.small - mv $tmpdir/uniquewords64k $tmpdir/uniquewords + if [ -f "${tmpdir}/es_wordlist.json" ]; then + echo "Could not find the large collection of Spanish words es_wordlist.json" + echo "Trying to download it via wget" + + if ! which wget >&/dev/null; then + echo "This script requires you to first install wget" + exit 1; + fi + + cwd=`pwd` + cd $tmpdir + wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz + + if [ ! -e ${tmpdir}/es_wordlist.json.tgz ]; then + echo "Download of the large Spanish word list failed" + exit 1; + fi + + tar -xvzfo es_wordlist.json.tgz || exit 1; + cd $cwd fi + + # Merge with gigaword corpus + $local/merge_lexicons.py ${tmpdir} ${lexicon} + mv $tmpdir/uniquewords $tmpdir/uniquewords.small + mv $tmpdir/uniquewords64k $tmpdir/uniquewords fi #Then get the list of phones form basic_rules in the lexicon folder diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 8c67ae56804..5c09f09bc35 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -7,55 +7,58 @@ import sys import json import codecs -import os import operator -wordlimit=64000 -uw_fisher="data/local/tmp/uniquewords" -uw_gigaword="/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" -uw_LDC="/export/corpora/LDC/LDC96L16/callhome_spanish_lexicon_970908/preferences" +wordlimit = 64000 +tmpdir = sys.argv[1] +ldc_lexicon = sys.argv[2] +uw_fisher = tmpdir + "/uniquewords" +uw_gigaword = tmpdir + "/es_wordlist.json" +uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary fisher = codecs.open(uw_fisher, encoding='utf-8') for line in fisher: - merged_lexicon.append(line.strip()) + merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the fisher data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." 
# Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') -for line in ldc: - entries = line.strip().split('\t') - if entries[0].lower() not in merged_lexicon: - merged_lexicon.append(entries[0].lower()) +for line in ldc: + entries = line.strip().split('\t') + if entries[0].lower() not in merged_lexicon: + merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the LDC data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) for item in gigaword: - # We need a maximum of wordlimit words in the lexicon - if len(merged_lexicon) == wordlimit: - break + # We need a maximum of wordlimit words in the lexicon + if len(merged_lexicon) == wordlimit: + break - if item[0].lower() not in merged_lexicon: - merged_lexicon.append(item[0].lower()) - -print "After adding the Gigaword data, the lexicon contains " + str(len(merged_lexicon)) + " entries." + if item[0].lower() not in merged_lexicon: + merged_lexicon.append(item[0].lower()) + +print "After adding the Gigaword data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Now write the uniquewords to a file -lf = codecs.open('data/local/tmp/uniquewords64k', encoding='utf-8', mode='w+') +lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + lf.write(item + "\n") lf.close() print "Finshed writing unique words" - From b7aa6b126e93806cb72036f4be9db95d4bb002e4 Mon Sep 17 00:00:00 2001 From: Gaurav Kumar Date: Tue, 16 Feb 2016 22:15:36 -0500 Subject: [PATCH 12/32] Small changes. Fixes #494 --- egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 824edd99da8..dae46cfddf5 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -22,7 +22,7 @@ lexicon=$1 #Get all unique words, remove punctuation. if [ $stage -le 0 ]; then cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords - if [ -f "${tmpdir}/es_wordlist.json" ]; then + if [ ! 
-f "${tmpdir}/es_wordlist.json" ]; then echo "Could not find the large collection of Spanish words es_wordlist.json" echo "Trying to download it via wget" @@ -40,7 +40,7 @@ if [ $stage -le 0 ]; then exit 1; fi - tar -xvzfo es_wordlist.json.tgz || exit 1; + tar -xovzf es_wordlist.json.tgz || exit 1; cd $cwd fi From 577659a6836c4419a417aac6e9d9d7a659ccd3af Mon Sep 17 00:00:00 2001 From: Xingyu Na Date: Wed, 17 Feb 2016 12:27:38 +0800 Subject: [PATCH 13/32] fix swbd1 data prep duplicates --- .../s5/local/swbd1_data_download.sh | 17 ++-------------- egs/fisher_swbd/s5/local/swbd1_data_prep.sh | 20 +------------------ egs/swbd/s5c/local/swbd1_data_download.sh | 17 ++-------------- egs/swbd/s5c/local/swbd1_data_prep.sh | 19 +----------------- 4 files changed, 6 insertions(+), 67 deletions(-) diff --git a/egs/fisher_swbd/s5/local/swbd1_data_download.sh b/egs/fisher_swbd/s5/local/swbd1_data_download.sh index 6dac146c26b..95c9d5e58a4 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_download.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_download.sh @@ -10,18 +10,11 @@ ## you unpacked this. We are just doing a "find" command to locate ## the .sph files. -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). -## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - . path.sh #check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" exit 1; fi @@ -30,18 +23,12 @@ SWBD_DIR=$1 dir=data/local/train_swbd mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -[ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - # Trans directory check if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then ( diff --git a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh index 98a12e1c0a3..54513437dbe 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh @@ -14,7 +14,7 @@ #check existing directories if [ $# != 1 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD" exit 1; fi @@ -23,7 +23,6 @@ SWBD_DIR=$1 dir=data/local/train_swbd mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" @@ -34,23 +33,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! 
-d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - # To get the SWBD transcriptions and dict, do: - echo " *** Downloading transcriptions and dictionary ***" - ( - cd $dir; - wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; diff --git a/egs/swbd/s5c/local/swbd1_data_download.sh b/egs/swbd/s5c/local/swbd1_data_download.sh index dd3559d2b45..d8f076b5141 100755 --- a/egs/swbd/s5c/local/swbd1_data_download.sh +++ b/egs/swbd/s5c/local/swbd1_data_download.sh @@ -10,18 +10,11 @@ ## you unpacked this. We are just doing a "find" command to locate ## the .sph files. -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). -## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - . path.sh #check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" exit 1; fi @@ -30,18 +23,12 @@ SWBD_DIR=$1 dir=data/local/train mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -[ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - # Trans directory check if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then ( diff --git a/egs/swbd/s5c/local/swbd1_data_prep.sh b/egs/swbd/s5c/local/swbd1_data_prep.sh index 57fb0ff56c8..9621e7fc06e 100755 --- a/egs/swbd/s5c/local/swbd1_data_prep.sh +++ b/egs/swbd/s5c/local/swbd1_data_prep.sh @@ -21,7 +21,7 @@ #check existing directories if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" exit 1; fi @@ -41,23 +41,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - ( - cd $dir; - if [ ! -d swb_ms98_transcriptions ]; then - echo " *** Downloading trascriptions and dictionary ***" - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - fi - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! 
-f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; From 92994e2c9a6b60f3390ab19081b1759aa2caaa74 Mon Sep 17 00:00:00 2001 From: Gaurav Kumar Date: Wed, 17 Feb 2016 03:44:52 -0500 Subject: [PATCH 14/32] Handle multiple pronunciations in lexicon. Fixes #506 --- egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index dae46cfddf5..6d04f53c7e5 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -70,6 +70,7 @@ if [ $stage -le 2 ]; then # representation cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ + | awk -F '[/][/]' '{print $1}' \ > $tmpdir/lexicon_raw fi From a059643d198777975a58cce5816fd5e1d642963a Mon Sep 17 00:00:00 2001 From: vesis84 Date: Wed, 17 Feb 2016 11:49:47 +0100 Subject: [PATCH 15/32] updating 'cmd.sh' for BUT cluster in various recipes, --- egs/ami/s5/cmd.sh | 6 +++--- egs/rm/s5/cmd.sh | 2 +- egs/swbd/s5c/cmd.sh | 2 +- egs/tedlium/s5/cmd.sh | 2 +- egs/timit/s5/cmd.sh | 16 ++++++++-------- egs/wsj/s5/cmd.sh | 6 +++--- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh index 9bc2b3195ef..c3ac80d6846 100644 --- a/egs/ami/s5/cmd.sh +++ b/egs/ami/s5/cmd.sh @@ -28,10 +28,10 @@ export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2" if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi diff --git a/egs/rm/s5/cmd.sh b/egs/rm/s5/cmd.sh index 4478796305e..4d009813fd2 100644 --- a/egs/rm/s5/cmd.sh +++ b/egs/rm/s5/cmd.sh @@ -22,7 +22,7 @@ cuda_cmd="queue.pl -l arch=*64 -l gpu=1" # BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" diff --git a/egs/swbd/s5c/cmd.sh b/egs/swbd/s5c/cmd.sh index 3f7de21e279..3dfaceaafab 100644 --- a/egs/swbd/s5c/cmd.sh +++ b/egs/swbd/s5c/cmd.sh @@ -15,7 +15,7 @@ export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.25" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" diff --git a/egs/tedlium/s5/cmd.sh 
b/egs/tedlium/s5/cmd.sh index bed97d34020..ba7f120e599 100644 --- a/egs/tedlium/s5/cmd.sh +++ b/egs/tedlium/s5/cmd.sh @@ -19,7 +19,7 @@ host=$(hostname -f) if [ ${host#*.} == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" diff --git a/egs/timit/s5/cmd.sh b/egs/timit/s5/cmd.sh index fd91a53ff73..0150f486298 100644 --- a/egs/timit/s5/cmd.sh +++ b/egs/timit/s5/cmd.sh @@ -12,18 +12,18 @@ #export cuda_cmd=run.pl -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then +if [ "$(hostname -d)" == "clsp.jhu.edu" ]; then export train_cmd="queue.pl -l arch=*64*" export decode_cmd="queue.pl -l arch=*64* --mem 3G" - export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" export cuda_cmd="queue.pl -l gpu=1" -elif [[ $(hostname -f) == *.fit.vutbr.cz ]]; then +elif [ "$(hostname -d)" == "fit.vutbr.cz" ]; then #b) BUT cluster options - queue="all.q@@blade,all.q@@speech,all.q@dellgpu*,all.q@supergpu*" - export train_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,matylda5=0.5" - export decode_cmd="queue.pl -q $queue -l ram_free=3000M,mem_free=3000M,matylda5=0.1" - export mkgraph_cmd="queue.pl -q $queue -l ram_free=4G,mem_free=4G,matylda5=3" - export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu1,long.q@pcgpu*,long.q@supergpu1 -l gpu=1" + queue="all.q@@blade,all.q@@speech" + gpu_queue="long.q@@gpu" + storage="matylda5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.5" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" else echo "$0: you need to define options for your cluster." 
exit 1; diff --git a/egs/wsj/s5/cmd.sh b/egs/wsj/s5/cmd.sh index e5e8f9d26d4..96c48af42c1 100644 --- a/egs/wsj/s5/cmd.sh +++ b/egs/wsj/s5/cmd.sh @@ -21,9 +21,9 @@ export cuda_cmd="queue.pl -l gpu=1" #c) BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi From 04cd90211f8dffe4fccbf63de92dfddb86e423a3 Mon Sep 17 00:00:00 2001 From: vesis84 Date: Wed, 17 Feb 2016 12:22:42 +0100 Subject: [PATCH 16/32] fixing tidigits data preparation, --- egs/tidigits/s5/local/tidigits_prepare_lang.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/egs/tidigits/s5/local/tidigits_prepare_lang.sh b/egs/tidigits/s5/local/tidigits_prepare_lang.sh index ff316514fc9..0bc08ab40a0 100755 --- a/egs/tidigits/s5/local/tidigits_prepare_lang.sh +++ b/egs/tidigits/s5/local/tidigits_prepare_lang.sh @@ -88,10 +88,11 @@ utils/make_lexicon_fst.pl $tmpdir/lexicon.txt 0.5 sil | \ cp $lang/L.fst $lang/L_disambig.fst -silphonelist=`cat $lang/phones/silence.csl | sed 's/:/ /g'` -nonsilphonelist=`cat $lang/phones/nonsilence.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo +num_sil_states=5 +num_nonsil_states=3 +silphonelist=`cat $lang/phones/silence.csl` +nonsilphonelist=`cat $lang/phones/nonsilence.csl` +utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$lang/topo # Now we prepare a simple grammar G.fst that's a kind of loop of # digits (no silence in this, since that's handled in L.fst) From 2646cfb6fda83a358f27d06b0ec4b2cee95ef264 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 17 Feb 2016 18:11:25 -0500 Subject: [PATCH 17/32] chain+swbd experiments: tuning-experiment results --- egs/swbd/s5c/local/chain/run_tdnn_6a.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh index 12589033819..c618d1c0adf 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -4,6 +4,16 @@ # but take the final-hidden-dim back up to 500, which is the same as what # it was in 5v. +# No better. +#local/chain/compare_wer.sh 5v 6a +#System 5v 6a +#WER on train_dev(tg) 15.38 15.49 +#WER on train_dev(fg) 14.39 14.30 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.109471 +#Final valid prob -0.131797 -0.129035 + # _5y is as _5v, but rebalancing the network to have fewer parameters in the # final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 # (it defaults to --jesus-forward-hidden-dim) to 400, and increasing From 187fa16fcfb4c1b55405717506964fe0d2245eb8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 17 Feb 2016 18:38:42 -0500 Subject: [PATCH 18/32] chain branch: changing the self-repair code so that it should work well for sigmoid and tanh, although this is not tested yet. some other bug-fixes. 
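
For readers skimming the diff below: the numpy sketch that follows mirrors what the new per-component RepairGradients() methods do, based on the comments added in nnet-simple-component.cc.  It is not Kaldi code; the defaults shown (scale 1e-5 as used in the recipes, derivative thresholds 0.05 for sigmoid, 0.2 for tanh, 0.05/0.95 for ReLU) are taken from the patch, while the random per-minibatch gating is omitted apart from the compensating 1/repair_probability factor.  Array shapes are [num-frames x dim], matching the CuMatrix code.

import numpy as np

def repair_sigmoid(in_deriv, out_value, avg_deriv, scale=1.0e-05,
                   lower_threshold=0.05, repair_probability=0.5):
    # Dimensions whose average derivative (deriv_sum_ / count_) is below the
    # threshold are "problematic"; push their pre-nonlinearity inputs back
    # towards zero by adding -scale * (2*y - 1), where y is the sigmoid output.
    bad = (avg_deriv < lower_threshold).astype(in_deriv.dtype)     # [dim]
    in_deriv -= (2.0 * scale / repair_probability) * out_value * bad
    in_deriv += (scale / repair_probability) * bad
    return in_deriv

def repair_tanh(in_deriv, out_value, avg_deriv, scale=1.0e-05,
                lower_threshold=0.2, repair_probability=0.5):
    # Same idea; the tanh output already lives in (-1, 1), so the added term
    # is simply -scale * y for the problematic dimensions.
    bad = (avg_deriv < lower_threshold).astype(in_deriv.dtype)
    in_deriv -= (scale / repair_probability) * out_value * bad
    return in_deriv

def repair_relu(in_deriv, avg_active, scale=1.0e-05,
                lower_threshold=0.05, upper_threshold=0.95,
                repair_probability=0.5):
    # avg_active is the fraction of frames on which each unit fired
    # (deriv_sum_ / count_, since the ReLU derivative is 0 or 1).  Units that
    # are almost never active get a small positive constant added to their
    # derivative; units that are almost always active get a small negative one.
    correction = ((avg_active < lower_threshold).astype(in_deriv.dtype)
                  - (avg_active > upper_threshold).astype(in_deriv.dtype))
    in_deriv += (scale / repair_probability) * correction
    return in_deriv

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    deriv = rng.randn(4, 3).astype(np.float32)
    y = 1.0 / (1.0 + np.exp(-rng.randn(4, 3).astype(np.float32)))
    avg_deriv = np.array([0.01, 0.20, 0.24], dtype=np.float32)  # first dim saturated
    print(repair_sigmoid(deriv.copy(), y, avg_deriv))
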
--- src/nnet3/nnet-component-itf.cc | 57 +------- src/nnet3/nnet-component-itf.h | 7 - src/nnet3/nnet-compute-test.cc | 23 ++-- src/nnet3/nnet-general-component.h | 20 +-- src/nnet3/nnet-simple-component.cc | 206 +++++++++++++++++++++++++++-- src/nnet3/nnet-simple-component.h | 14 ++ src/nnet3/nnet-test-utils.cc | 4 +- 7 files changed, 234 insertions(+), 97 deletions(-) diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index bbab3d3ba81..cdb43473090 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -55,7 +55,7 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute ans = new StatisticsExtractionComponentPrecomputedIndexes(); } else if (cpi_type == "StatisticsPoolingComponentPrecomputedIndexes") { ans = new StatisticsPoolingComponentPrecomputedIndexes(); - } + } if (ans != NULL) { KALDI_ASSERT(cpi_type == ans->Type()); } @@ -428,61 +428,6 @@ void NonlinearComponent::InitFromConfig(ConfigLine *cfl) { } -void NonlinearComponent::RepairGradients( - bool measure_deriv, - BaseFloat default_lower_threshold, - BaseFloat default_upper_threshold, - CuMatrixBase *in_deriv) const { - const CuVector &stats_src = (measure_deriv ? deriv_sum_ : value_sum_); - if (self_repair_scale_ == 0.0 || count_ == 0.0 || stats_src.Dim() != dim_) - return; - // we use this 'repair_probability' (hardcoded for now) to limit - // this code to running on about half of the minibatches. - BaseFloat repair_probability = 0.5; - if (RandUniform() > repair_probability) - return; - - // check that the self-repair scale is in a reasonable range. - KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); - BaseFloat unset = kUnsetThreshold; // -1000.0 - BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? - default_lower_threshold : - self_repair_lower_threshold_) * - count_, - upper_threshold = (self_repair_upper_threshold_ == unset ? - default_upper_threshold : - self_repair_upper_threshold_) * - count_; - - CuMatrix storage(2, dim_ + 2, kUndefined); - CuSubVector thresholds_vec(storage.RowData(0) + dim_, 2); - CuSubMatrix stats_mat(storage, 0, 2, 0, dim_); - thresholds_vec(0) = -lower_threshold; - thresholds_vec(1) = -upper_threshold; - CuSubVector row0(stats_mat, 0); - CuSubVector row1(stats_mat, 1); - - row0.CopyFromVec(stats_src); - row1.CopyFromVec(row0); - stats_mat.AddVecToCols(1.0, thresholds_vec, 1.0); - // now row0 equals stats - lower_threshold, and - // row1 equals stats - upper_threshold. - stats_mat.ApplyHeaviside(); - // now row0 equals (stats > lower_threshold ? 1 : 0), and - // row1 equals (stats > upper_threshold ? 1 : 0). - // what we want is: - // self_repair_scale * ((stats <= lower_threshold ? 1 : 0) + - // (stats > upper_threshold ? -1 : 0)). - // - // we can get these in stats_mat.Row(0) by computing: - // -self_repair_scale * (stats_mat.Row(1) + stats_mat.Row(0) - 1). - row0.AddVec(1.0, row1, 1.0); - row0.Add(-1.0); - // [actually we need to divide by repair_probability also, to - // correct for the fact that we only do this on some frames.] 
- row0.Scale(-self_repair_scale_ / repair_probability); - in_deriv->AddVecToRows(1.0, row0, 1.0); -} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 6ee702b7797..be78014c20b 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -494,13 +494,6 @@ class NonlinearComponent: public Component { protected: enum { kUnsetThreshold = -1000 }; - // this function is to be called from Backprop code if it makes - // sense for the nonlinearity typte - void RepairGradients(bool measure_deriv, - BaseFloat default_lower_threshold, - BaseFloat default_upper_threshold, - CuMatrixBase *in_deriv) const; - friend class SigmoidComponent; friend class TanhComponent; friend class SoftmaxComponent; diff --git a/src/nnet3/nnet-compute-test.cc b/src/nnet3/nnet-compute-test.cc index 33e9ede3812..7fdb3dab982 100644 --- a/src/nnet3/nnet-compute-test.cc +++ b/src/nnet3/nnet-compute-test.cc @@ -34,19 +34,16 @@ void UnitTestNnetComputationIo(NnetComputation *computation) { computation->Write(os, binary); const std::string &original_output = os.str(); std::istringstream computation_is(original_output); - KALDI_LOG << computation_is.str(); computation->Read(computation_is, binary); std::istringstream computation_is2(original_output); NnetComputation computation2; computation2.Read(computation_is2, binary); - + std::ostringstream os2, os3; computation->Write(os2, binary); computation2.Write(os3, binary); - + if (binary) { - KALDI_LOG << os2.str(); - KALDI_LOG << original_output; KALDI_ASSERT(os2.str() == original_output); KALDI_ASSERT(os3.str() == original_output); } @@ -62,15 +59,13 @@ void UnitTestComputationRequestIo(ComputationRequest *request) { std::istringstream request_is2(original_output); ComputationRequest request2; request2.Read(request_is2, binary); - + std::ostringstream os2, os3; request->Write(os2, binary); request2.Write(os3, binary); KALDI_ASSERT(*request == request2); if (binary) { - KALDI_LOG << os2.str(); - KALDI_LOG << original_output; KALDI_ASSERT(os2.str() == original_output); KALDI_ASSERT(os3.str() == original_output); } @@ -86,10 +81,10 @@ void TestNnetDecodable(const ComputationRequest &request, } void UnitTestNnetCompute() { - for (int32 n = 0; n < 20; n++) { + for (int32 n = 0; n < 20; n++) { struct NnetGenerationOptions gen_config; - + std::vector configs; GenerateConfigSequence(gen_config, &configs); Nnet nnet; @@ -102,7 +97,7 @@ void UnitTestNnetCompute() { ComputationRequest request; std::vector > inputs; ComputeExampleComputationRequestSimple(nnet, &request, &inputs); - + NnetComputation computation; Compiler compiler(request, nnet); @@ -117,7 +112,7 @@ void UnitTestNnetCompute() { } CheckComputationOptions check_config; // we can do the rewrite check since it's before optimization. 
- check_config.check_rewrite = true; + check_config.check_rewrite = true; ComputationChecker checker(check_config, nnet, computation); checker.Check(); @@ -135,7 +130,7 @@ void UnitTestNnetCompute() { NnetComputeOptions compute_opts; if (RandInt(0, 1) == 0) compute_opts.debug = true; - + computation.ComputeCudaIndexes(); NnetComputer computer(compute_opts, computation, @@ -151,7 +146,7 @@ void UnitTestNnetCompute() { const CuMatrixBase &output(computer.GetOutput("output")); TestNnetDecodable(request, inputs, nnet, output); - + KALDI_LOG << "Output sum is " << output.Sum(); CuMatrix output_deriv(output.NumRows(), output.NumCols()); output_deriv.SetRandn(); diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index edf6b993ddc..e7c2ff3a78e 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -142,11 +142,11 @@ class DistributeComponentPrecomputedIndexes: virtual ComponentPrecomputedIndexes* Copy() const { return new DistributeComponentPrecomputedIndexes(*this); } - + virtual void Write(std::ostream &ostream, bool binary) const; - + virtual void Read(std::istream &istream, bool binary); - + virtual std::string Type() const { return "DistributeComponentPrecomputedIndexes"; } }; @@ -291,10 +291,10 @@ class StatisticsExtractionComponentPrecomputedIndexes: } virtual void Write(std::ostream &os, bool binary) const; - + virtual void Read(std::istream &is, bool binary); - - virtual std::string Type() const { return "StaticticsExtractionComponentPrecomputedIndexes"; } + + virtual std::string Type() const { return "StatisticsExtractionComponentPrecomputedIndexes"; } private: virtual ~StatisticsExtractionComponentPrecomputedIndexes() { } }; @@ -431,12 +431,12 @@ class StatisticsPoolingComponentPrecomputedIndexes: ComponentPrecomputedIndexes *Copy() const { return new StatisticsPoolingComponentPrecomputedIndexes(*this); } - + virtual void Write(std::ostream &os, bool binary) const; - + virtual void Read(std::istream &is, bool binary); - - virtual std::string Type() const { return "StaticticsPoolingComponentPrecomputedIndexes"; } + + virtual std::string Type() const { return "StatisticsPoolingComponentPrecomputedIndexes"; } }; } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index e97278f86dd..aadd0c05a1d 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -418,13 +418,14 @@ void NormalizeComponent::Backprop(const std::string &debug_info, in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans, 1.0); else in_deriv->MulRowsVec(in_norm); + in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0); + in_norm.ApplyPow(3.0); + dot_products.MulElements(in_norm); + + in_deriv->AddDiagVecMat(-1.0 / d_scaled, + dot_products, in_value, + kNoTrans, 1.0); } - in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0); - in_norm.ApplyPow(3.0); - dot_products.MulElements(in_norm); - in_deriv->AddDiagVecMat(-1.0 / d_scaled, - dot_products, in_value, - kNoTrans, 1.0); } void SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes, @@ -442,10 +443,79 @@ void SigmoidComponent::Backprop(const std::string &debug_info, CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->DiffSigmoid(out_value, out_deriv); - RepairGradients(false, 0.025, 0.975, in_deriv); + RepairGradients(out_value, in_deriv); } } +void SigmoidComponent::RepairGradients( + const CuMatrixBase &out_value, + CuMatrixBase *in_deriv) const { + // maximum possible 
derivative of SigmoidComponent is 0.25. + // the default lower-threshold on the derivative, below which we + // add a term to the derivative to encourage the inputs to the sigmoid + // to be closer to zero, is 0.05, which means the derivative is on average + // 5 times smaller than its maximum possible value. + BaseFloat default_lower_threshold = 0.05; + + // we use this 'repair_probability' (hardcoded for now) to limit + // this code to running on about half of the minibatches. + BaseFloat repair_probability = 0.5; + + if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ || + RandUniform() > repair_probability) + return; + + // check that the self-repair scale is in a reasonable range. + KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); + BaseFloat unset = kUnsetThreshold; // -1000.0 + BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? + default_lower_threshold : + self_repair_lower_threshold_) * + count_; + if (self_repair_upper_threshold_ != unset) { + KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " + << "components, it does nothing."; + } + + // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors). + CuMatrix thresholds(1, dim_); + CuSubVector thresholds_vec(thresholds, 0); + thresholds_vec.AddVec(-1.0, deriv_sum_); + thresholds_vec.Add(lower_threshold); + thresholds.ApplyHeaviside(); + + // At this point, 'thresholds_vec' contains a 1 for each dimension of + // the output that is 'problematic', i.e. for which the avg-deriv + // is less than the self-repair lower threshold, and a 0 for + // each dimension that is not problematic. + + // what we want to do is to add + // -self_repair_scale_ / repair_probability times (2 * output-valiue - 1.0) + // to the input derivative for each problematic dimension. + + // Here, 2 * output - 1.0 is a version of the sigmoid that goes from -1.0 to + // 1.0, like a tanh. the negative sign is so that for inputs <0, we push them + // up towards 0, and for inputs >0, we push them down towards 0. + // Our use of this sigmoid-type function here is just a convenience since + // we have it available. We could use just about any function that is positive + // for inputs < 0 and negative for inputs > 0. + + // We can rearrange the above as: for only the problematic columns, + // input-deriv -= 2 * self-repair-scale / repair-probabilty * output + // input-deriv += self-repair-scale / repair-probabilty + // which we can write as: + // input-deriv -= 2 * self-repair-scale / repair-probabilty * output * thresholds-vec + // input-deriv += self-repair-scale / repair-probabilty * thresholds-vec + + in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability, + out_value, kNoTrans, thresholds_vec); + in_deriv->AddVecToCols(self_repair_scale_ / repair_probability, + thresholds_vec); +} + + + void SigmoidComponent::StoreStats(const CuMatrixBase &out_value) { // only store stats about every other minibatch. 
if (RandInt(0, 1) == 0) @@ -628,6 +698,68 @@ void TanhComponent::Propagate(const ComponentPrecomputedIndexes *indexes, out->Tanh(in); } + +void TanhComponent::RepairGradients( + const CuMatrixBase &out_value, + CuMatrixBase *in_deriv) const { + // maximum possible derivative of TanhComponent is 1.0. + // the default lower-threshold on the derivative, below which we + // add a term to the derivative to encourage the inputs to the tanh + // to be closer to zero, is 0.2, which means the derivative is on average + // 5 times smaller than its maximum possible value. + BaseFloat default_lower_threshold = 0.2; + + // we use this 'repair_probability' (hardcoded for now) to limit + // this code to running on about half of the minibatches. + BaseFloat repair_probability = 0.5; + + if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ || + RandUniform() > repair_probability) + return; + + // check that the self-repair scale is in a reasonable range. + KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); + BaseFloat unset = kUnsetThreshold; // -1000.0 + BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? + default_lower_threshold : + self_repair_lower_threshold_) * + count_; + if (self_repair_upper_threshold_ != unset) { + KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " + << "components, it does nothing."; + } + + // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors). + CuMatrix thresholds(1, dim_); + CuSubVector thresholds_vec(thresholds, 0); + thresholds_vec.AddVec(-1.0, deriv_sum_); + thresholds_vec.Add(lower_threshold); + thresholds.ApplyHeaviside(); + + // At this point, 'thresholds_vec' contains a 1 for each dimension of + // the output that is 'problematic', i.e. for which the avg-deriv + // is less than the self-repair lower threshold, and a 0 for + // each dimension that is not problematic. + + // what we want to do is to add -self_repair_scale_ / repair_probability times + // output-value to the input derivative for each problematic dimension. + // note that for the tanh, the output-value goes from -1.0 when the input is + // -inf to +1.0 when the input is +inf. The negative sign is so that for + // inputs <0, we push them up towards 0, and for inputs >0, we push them down + // towards 0. Our use of the tanh here is just a convenience since we have it + // available. We could use just about any function that is positive for + // inputs < 0 and negative for inputs > 0.
+ + // We can rearrange the above as: for only the problematic columns, + // input-deriv -= self-repair-scale / repair-probability * output + // which we can write as: + // input-deriv -= self-repair-scale / repair-probability * output * thresholds-vec + + in_deriv->AddMatDiagVec(-self_repair_scale_ / repair_probability, + out_value, kNoTrans, thresholds_vec); +} + void TanhComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, @@ -638,7 +770,7 @@ void TanhComponent::Backprop(const std::string &debug_info, CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->DiffTanh(out_value, out_deriv); - RepairGradients(false, -0.95, 0.95, in_deriv); + RepairGradients(out_value, in_deriv); } } @@ -681,10 +813,66 @@ void RectifiedLinearComponent::Backprop( if (in_deriv != NULL) { in_deriv->Heaviside(out_value); in_deriv->MulElements(out_deriv); - RepairGradients(true, 0.05, 0.95, in_deriv); + RepairGradients(in_deriv); } } + +void RectifiedLinearComponent::RepairGradients( + CuMatrixBase *in_deriv) const { + BaseFloat default_lower_threshold = 0.05, + default_upper_threshold = 0.95; + // we use this 'repair_probability' (hardcoded for now) to limit + // this code to running on about half of the minibatches. + BaseFloat repair_probability = 0.5; + + if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ || + RandUniform() > repair_probability) + return; + + // check that the self-repair scale is in a reasonable range. + KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); + BaseFloat unset = kUnsetThreshold; // -1000.0 + BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? + default_lower_threshold : + self_repair_lower_threshold_) * + count_, + upper_threshold = (self_repair_upper_threshold_ == unset ? + default_upper_threshold : + self_repair_upper_threshold_) * + count_; + + CuMatrix storage(2, dim_ + 2, kUndefined); + CuSubVector thresholds_vec(storage.RowData(0) + dim_, 2); + CuSubMatrix stats_mat(storage, 0, 2, 0, dim_); + thresholds_vec(0) = -lower_threshold; + thresholds_vec(1) = -upper_threshold; + CuSubVector row0(stats_mat, 0); + CuSubVector row1(stats_mat, 1); + + row0.CopyFromVec(deriv_sum_); + row1.CopyFromVec(row0); + stats_mat.AddVecToCols(1.0, thresholds_vec, 1.0); + // now row0 equals stats - lower_threshold, and + // row1 equals stats - upper_threshold. + stats_mat.ApplyHeaviside(); + // now row0 equals (stats > lower_threshold ? 1 : 0), and + // row1 equals (stats > upper_threshold ? 1 : 0). + // what we want is: + // self_repair_scale * ((stats <= lower_threshold ? 1 : 0) + + // (stats > upper_threshold ? -1 : 0)). + // + // we can get these in stats_mat.Row(0) by computing: + // -self_repair_scale * (stats_mat.Row(1) + stats_mat.Row(0) - 1). + row0.AddVec(1.0, row1, 1.0); + row0.Add(-1.0); + // [actually we need to divide by repair_probability also, to + // correct for the fact that we only do this on some frames.] + row0.Scale(-self_repair_scale_ / repair_probability); + in_deriv->AddVecToRows(1.0, row0, 1.0); +} + + void RectifiedLinearComponent::StoreStats( const CuMatrixBase &out_value) { // only store stats about every other minibatch.
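A quick summary of what the self-repair code above computes, written as a sketch in terms of the same variable names as the code. For SigmoidComponent, for each 'problematic' dimension d (one whose accumulated derivative deriv_sum_(d) fell below lower_threshold), the AddMatDiagVec plus AddVecToCols calls together add -(self_repair_scale_ / repair_probability) * (2 * out_value(i, d) - 1.0) to in_deriv(i, d) for every row i, nudging that dimension's pre-activation back towards zero. TanhComponent is identical except that the added term is -(self_repair_scale_ / repair_probability) * out_value(i, d), since tanh already ranges over (-1, 1). For RectifiedLinearComponent, writing s = deriv_sum_, l = lower_threshold, u = upper_threshold and H for the Heaviside step computed by ApplyHeaviside, the quantity built up in row0 is -(H(s - l) + H(s - u) - 1) = (1 - H(s - l)) - H(s - u), i.e. +1 for dimensions that are active too rarely, -1 for dimensions that are active too often, and 0 otherwise; scaled by self_repair_scale_ / repair_probability it is then added to every row of in_deriv by AddVecToRows.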
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index c3a3048202f..d8295ac10e5 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -190,6 +190,11 @@ class SigmoidComponent: public NonlinearComponent { CuMatrixBase *in_deriv) const; virtual void StoreStats(const CuMatrixBase &out_value); private: + // this function is called from Backprop code and only does something if the + // self-repair-scale config value is set. + void RepairGradients(const CuMatrixBase &out_value, + CuMatrixBase *in_deriv) const; + SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow. }; @@ -214,6 +219,11 @@ class TanhComponent: public NonlinearComponent { CuMatrixBase *in_deriv) const; virtual void StoreStats(const CuMatrixBase &out_value); private: + // this function is called from Backprop code and only does something if the + // self-repair-scale config value is set. + void RepairGradients(const CuMatrixBase &out_value, + CuMatrixBase *in_deriv) const; + TanhComponent &operator = (const TanhComponent &other); // Disallow. }; @@ -242,6 +252,10 @@ class RectifiedLinearComponent: public NonlinearComponent { virtual void StoreStats(const CuMatrixBase &out_value); private: + // this function is called from Backprop code and only does something if the + // self-repair-scale config value is set. + void RepairGradients(CuMatrixBase *in_deriv) const; + RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow. }; diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 5a02aa7da02..933808dc61c 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -942,7 +942,9 @@ static void GenerateRandomComponentConfig(std::string *component_type, BaseFloat target_rms = (RandInt(1, 200) / 100.0); std::string add_log_stddev = (Rand() % 2 == 0 ? "True" : "False"); *component_type = "NormalizeComponent"; - os << "dim=" << RandInt(1, 50) + // avoid dim=1 because the derivatives would be zero, which + // makes them hard to test. 
+ os << "dim=" << RandInt(2, 50) << " target-rms=" << target_rms << " add-log-stddev=" << add_log_stddev; break; From 7bc34fe909a3c324349c6e88006f7fdb45adedcb Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 18 Feb 2016 15:38:58 -0500 Subject: [PATCH 19/32] cosmetic change: fix 'score' to 'cost' --- src/latbin/lattice-best-path.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/latbin/lattice-best-path.cc b/src/latbin/lattice-best-path.cc index dda41cd0604..dc25fb351c6 100644 --- a/src/latbin/lattice-best-path.cc +++ b/src/latbin/lattice-best-path.cc @@ -121,7 +121,7 @@ int main(int argc, char *argv[]) { } BaseFloat tot_weight_float = tot_weight.Value1() + tot_weight.Value2(); - KALDI_LOG << "Overall score per frame is " << (tot_weight_float/n_frame) + KALDI_LOG << "Overall cost per frame is " << (tot_weight_float/n_frame) << " = " << (tot_weight.Value1()/n_frame) << " [graph]" << " + " << (tot_weight.Value2()/n_frame) << " [acoustic]" << " over " << n_frame << " frames."; From b3bbc038681b5bcd6be02481785f929b5b99e11a Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 18 Feb 2016 16:58:46 -0500 Subject: [PATCH 20/32] modifying cmd.sh in example recipes to encourage the use of new-style queue options and conf/queue.conf --- egs/ami/s5/cmd.sh | 43 ++++++++++---------- egs/ami/s5/run_ihm.sh | 26 ++++++------ egs/aurora4/s5/cmd.sh | 43 ++++++++------------ egs/babel/s5/cmd.sh | 44 +++++++-------------- egs/babel/s5b/cmd.sh | 44 +++++++-------------- egs/babel/s5c/cmd.sh | 44 +++++++-------------- egs/bn_music_speech/v1/cmd.sh | 28 ++++++------- egs/callhome_egyptian/s5/cmd.sh | 33 +++++++--------- egs/chime1/s5/cmd.sh | 57 +++++++++------------------ egs/chime2/s5/cmd.sh | 44 ++++++++------------- egs/csj/s5/cmd.sh | 46 +++++++-------------- egs/farsdat/s5/cmd.sh | 40 +++++++------------ egs/fisher_callhome_spanish/s5/cmd.sh | 33 +++++++--------- egs/fisher_english/s5/cmd.sh | 44 +++++++-------------- egs/fisher_swbd/s5/cmd.sh | 15 ++----- egs/gale_arabic/s5/cmd.sh | 24 ++++++----- egs/gale_mandarin/s5/cmd.sh | 27 ++++++++----- egs/hkust/s5/cmd.sh | 26 ++++++------ egs/librispeech/s5/cmd.sh | 45 +++++++-------------- egs/lre/v1/cmd.sh | 39 ++++++------------ egs/lre07/v1/cmd.sh | 39 ++++++------------ egs/reverb/s5/cmd.sh | 44 +++++++-------------- egs/rm/s5/cmd.sh | 37 ++++++++--------- egs/sprakbanken/s5/cmd.sh | 45 +++++++-------------- egs/sre08/v1/cmd.sh | 39 ++++++------------ egs/sre10/v1/cmd.sh | 39 ++++++------------ egs/swbd/s5/cmd.sh | 40 +++++++------------ egs/swbd/s5b/cmd.sh | 43 +++++++------------- egs/swbd/s5c/cmd.sh | 32 +++++++++------ egs/thchs30/s5/cmd.sh | 3 +- egs/tidigits/s5/cmd.sh | 29 +++++++------- egs/timit/s5/cmd.sh | 47 ++++++++++------------ egs/voxforge/s5/cmd.sh | 29 +++++++------- egs/vystadial_cz/s5/cmd.sh | 38 +++++++++--------- egs/vystadial_en/s5/cmd.sh | 38 +++++++++--------- egs/wsj/s5/cmd.sh | 36 ++++++++--------- egs/wsj/s5/local/run_kl_hmm.sh | 2 + 37 files changed, 539 insertions(+), 786 deletions(-) diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh index c3ac80d6846..5ec5d4b715f 100644 --- a/egs/ami/s5/cmd.sh +++ b/egs/ami/s5/cmd.sh @@ -1,9 +1,24 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 1G" +export decode_cmd="queue.pl --mem 2G" +# the use of cuda_cmd is deprecated but it is sometimes still used in nnet1 +# scripts. +export cuda_cmd="queue.pl --gpu 1 --mem 20G" + +# the rest of this file is present for historical reasons. +# In general it's best to rely on conf/queue.conf for cluster-specific +# configuration. # On Eddie use: #export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00" @@ -11,20 +26,6 @@ #export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4" #export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00" -# JSALT2015 workshop, cluster AWS-EC2, (setup from Vijay) -export train_cmd="queue.pl -l arch=*64* --mem 1G" -export decode_cmd="queue.pl -l arch=*64* --mem 2G" -export highmem_cmd="queue.pl -l arch=*64* --mem 4G" -export scoring_cmd="queue.pl -l arch=*64*" -export cuda_cmd="queue.pl --gpu 1 -l mem_free=20G,ram_free=20G" -export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2" - -# To run locally, use: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export highmem_cmd=run.pl -#export cuda_cmd=run.pl - if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" @@ -33,5 +34,5 @@ if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -fi +fi diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index b4d41d7066a..b9d60d78182 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -10,13 +10,13 @@ mic=ihm stage=0 . 
utils/parse_options.sh -# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', set -euxo pipefail # Path where AMI gets downloaded (or where locally available): -AMI_DIR=$PWD/wav_db # Default, -case $(hostname -d) in +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, @@ -86,7 +86,7 @@ if [ $stage -le 5 ]; then data/$mic/train data/lang exp/$mic/tri2a exp/$mic/tri2_ali # Decode, graph_dir=exp/$mic/tri2a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/dev exp/$mic/tri2a/decode_dev_${LM} @@ -104,26 +104,26 @@ if [ $stage -le 6 ]; then data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali # Decode, graph_dir=exp/$mic/tri3a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/eval exp/$mic/tri3a/decode_eval_${LM} -fi +fi if [ $stage -le 7 ]; then # Train tri4a, which is LDA+MLLT+SAT, steps/train_sat.sh --cmd "$train_cmd" \ 5000 80000 data/$mic/train data/lang exp/$mic/tri3a_ali exp/$mic/tri4a - # Decode, + # Decode, graph_dir=exp/$mic/tri4a/graph_${LM} $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} + $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} fi nj_mmi=80 @@ -160,11 +160,11 @@ if [ $stage -le 11 ]; then decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_dev_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_dev_${LM} --iter $i \ - $graph_dir data/$mic/dev $decode_dir + $graph_dir data/$mic/dev $decode_dir decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_eval_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_eval_${LM} --iter $i \ - $graph_dir data/$mic/eval $decode_dir + $graph_dir data/$mic/eval $decode_dir done fi @@ -181,7 +181,7 @@ if [ $stage -le 13 ]; then --hidden-dim 950 \ --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3" \ --use-sat-alignments true - + local/online/run_nnet2_ms_sp_disc.sh \ --mic $mic \ --gmm-dir exp/$mic/tri4a \ diff --git a/egs/aurora4/s5/cmd.sh b/egs/aurora4/s5/cmd.sh index 139b2cd6c6c..378febca15b 100644 --- a/egs/aurora4/s5/cmd.sh +++ b/egs/aurora4/s5/cmd.sh @@ -1,29 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. 
If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still used in some example scripts +# here. export cuda_cmd="queue.pl --gpu 1" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/babel/s5/cmd.sh b/egs/babel/s5/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5/cmd.sh +++ b/egs/babel/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5b/cmd.sh b/egs/babel/s5b/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/babel/s5b/cmd.sh +++ b/egs/babel/s5b/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5c/cmd.sh b/egs/babel/s5c/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5c/cmd.sh +++ b/egs/babel/s5c/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/bn_music_speech/v1/cmd.sh b/egs/bn_music_speech/v1/cmd.sh index 27d1d36a6a6..d1ca1a6d126 100755 --- a/egs/bn_music_speech/v1/cmd.sh +++ b/egs/bn_music_speech/v1/cmd.sh @@ -1,17 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" - -#c) run it locally... -#export train_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/callhome_egyptian/s5/cmd.sh b/egs/callhome_egyptian/s5/cmd.sh index ab29f13d4cc..71dd849a93b 100755 --- a/egs/callhome_egyptian/s5/cmd.sh +++ b/egs/callhome_egyptian/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. 
The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/chime1/s5/cmd.sh b/egs/chime1/s5/cmd.sh index dda6226f419..0dcd5a9200f 100755 --- a/egs/chime1/s5/cmd.sh +++ b/egs/chime1/s5/cmd.sh @@ -1,39 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - - -#c) USFD cluster options -#config="conf/queue_usfd.conf" -#export train_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export decode_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export mkgraph_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export cuda_cmd="queue.pl --config $config --mem 24G --rmem 20G --gpu 1 --time 24:00:00" - - -#d) run it locally... 
-export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" + +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/chime2/s5/cmd.sh b/egs/chime2/s5/cmd.sh index 8bb00fe0ec6..0dcd5a9200f 100644 --- a/egs/chime2/s5/cmd.sh +++ b/egs/chime2/s5/cmd.sh @@ -1,30 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G" -export cuda_cmd="queue.pl -l gpu=1" -#export cuda_cmd="..." +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# the use of cuda_cmd is deprecated, but it's still used in this recipe. 
+export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/csj/s5/cmd.sh b/egs/csj/s5/cmd.sh index d5952fe0f87..71dd849a93b 100644 --- a/egs/csj/s5/cmd.sh +++ b/egs/csj/s5/cmd.sh @@ -1,31 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64*" -#export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export train_cmd="run.pl" -export decode_cmd="run.pl" -#export cuda_cmd="..." -#export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export mkgraph_cmd="run.pl" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/farsdat/s5/cmd.sh b/egs/farsdat/s5/cmd.sh index d749f2c9f1f..71dd849a93b 100644 --- a/egs/farsdat/s5/cmd.sh +++ b/egs/farsdat/s5/cmd.sh @@ -1,25 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export cuda_cmd="run.pl" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=2500M,mem_free=2500M,matylda5=0.5" -#export decode_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=3000M,mem_free=3000M,matylda5=0.1" -#export mkgraph_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=4G,mem_free=4G,matylda5=3" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu*,long.q@pco203-0[0124] -l gpu=1" - -#c) run locally... 
-#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index ab29f13d4cc..88db78823a5 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_english/s5/cmd.sh b/egs/fisher_english/s5/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/fisher_english/s5/cmd.sh +++ b/egs/fisher_english/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_swbd/s5/cmd.sh b/egs/fisher_swbd/s5/cmd.sh index ca31c61d256..88db78823a5 100644 --- a/egs/fisher_swbd/s5/cmd.sh +++ b/egs/fisher_swbd/s5/cmd.sh @@ -1,19 +1,12 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - # you can change cmd.sh depending on what type of queue you are using. # If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run +# can change all instances 'queue.pl' to run.pl (but be careful and run # commands one by one: most recipes will exhaust the memory on your -# machine). queue.pl works with GridEngine (qsub). slurm.pl works -# with slurm. Different queues are configured differently, with different +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different # queue names and different ways of specifying things like memory; # to account for these differences you can create and edit the file -# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf to match your queue's configuration. Search for # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
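For reference, the conf/queue.conf file that these comments refer to is a small plain-text file read by utils/queue.pl. A minimal sketch, adapted from the 'default_config' string inside utils/queue.pl (the exact qsub options depend on your grid, so treat this as an illustration rather than something to copy verbatim):

  command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
  option mem=* -l mem_free=$0,ram_free=$0
  option mem=0          # do not add anything to the qsub options
  option num_threads=* -pe smp $0
  option num_threads=1  # do not add anything to the qsub options
  default gpu=0
  option gpu=0
  option gpu=* -l gpu=$0 -q g.q

With a file like this in place, generic options such as "queue.pl --mem 4G" or "queue.pl --gpu 1" are translated into whatever resource strings your particular queue expects, which is why the cmd.sh files in this patch no longer hard-code qsub options like -l arch=*64* or ram_free/mem_free.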
diff --git a/egs/gale_arabic/s5/cmd.sh b/egs/gale_arabic/s5/cmd.sh index 6e2777b595b..71dd849a93b 100755 --- a/egs/gale_arabic/s5/cmd.sh +++ b/egs/gale_arabic/s5/cmd.sh @@ -1,11 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/gale_mandarin/s5/cmd.sh b/egs/gale_mandarin/s5/cmd.sh index 6e2777b595b..2d51ad82004 100755 --- a/egs/gale_mandarin/s5/cmd.sh +++ b/egs/gale_mandarin/s5/cmd.sh @@ -1,11 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated, but it's still used in this example +# directory. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/hkust/s5/cmd.sh b/egs/hkust/s5/cmd.sh index 2a46d89f385..71dd849a93b 100644 --- a/egs/hkust/s5/cmd.sh +++ b/egs/hkust/s5/cmd.sh @@ -1,13 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export train_cmd=run.pl -#export decode_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/librispeech/s5/cmd.sh b/egs/librispeech/s5/cmd.sh index 6395d96ca36..71dd849a93b 100644 --- a/egs/librispeech/s5/cmd.sh +++ b/egs/librispeech/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/lre/v1/cmd.sh b/egs/lre/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre/v1/cmd.sh +++ b/egs/lre/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/lre07/v1/cmd.sh b/egs/lre07/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre07/v1/cmd.sh +++ b/egs/lre07/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... 
-#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/reverb/s5/cmd.sh b/egs/reverb/s5/cmd.sh index e88b07e1195..71dd849a93b 100644 --- a/egs/reverb/s5/cmd.sh +++ b/egs/reverb/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64,gpu=1 -q g.q" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/rm/s5/cmd.sh b/egs/rm/s5/cmd.sh index 4d009813fd2..6e2f3e9ee48 100644 --- a/egs/rm/s5/cmd.sh +++ b/egs/rm/s5/cmd.sh @@ -1,23 +1,24 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" +export train_cmd=queue.pl +export decode_cmd=queue.pl +export mkgraph_cmd=queue.pl +export cuda_cmd="queue.pl --gpu 1" -# cuda_cmd is used for nnet1 scripts e.g. local/run_dnn.sh, but -# in the nnet2 scripts e.g. local/run_nnet2.sh, this is not -# used and we append options to train_cmd. -cuda_cmd="queue.pl -l arch=*64 -l gpu=1" - -#train_cmd="run.pl" -# with run.pl we do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. +# The rest of this file is here for historical reasons. For cluster-specific +# configuration it's generally better to use conf/queue.conf, see +# http://kaldi-asr.org/doc/queue.html. # BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then @@ -26,5 +27,5 @@ if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi diff --git a/egs/sprakbanken/s5/cmd.sh b/egs/sprakbanken/s5/cmd.sh index 43867ccf0d9..71dd849a93b 100644 --- a/egs/sprakbanken/s5/cmd.sh +++ b/egs/sprakbanken/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64 --mem 2G" -#export mkgraph_cmd="queue.pl -l arch=*64 --mem 2G" -#export big_memory_cmd="queue.pl -l arch=*64 --mem 2G" -#export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... 
-export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/sre08/v1/cmd.sh b/egs/sre08/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/sre08/v1/cmd.sh +++ b/egs/sre08/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/sre10/v1/cmd.sh b/egs/sre10/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100755 --- a/egs/sre10/v1/cmd.sh +++ b/egs/sre10/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/swbd/s5/cmd.sh b/egs/swbd/s5/cmd.sh index 4abf8546b0d..bae7f5cdf45 100644 --- a/egs/swbd/s5/cmd.sh +++ b/egs/swbd/s5/cmd.sh @@ -1,28 +1,16 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5b/cmd.sh b/egs/swbd/s5b/cmd.sh index 4abf8546b0d..575407ac0ff 100644 --- a/egs/swbd/s5b/cmd.sh +++ b/egs/swbd/s5b/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5c/cmd.sh b/egs/swbd/s5c/cmd.sh index 3dfaceaafab..c5a71711617 100644 --- a/egs/swbd/s5c/cmd.sh +++ b/egs/swbd/s5c/cmd.sh @@ -1,17 +1,23 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -# Default opts, -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -export cuda_cmd=run.pl # Run on local machine, -export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +export cuda_cmd="queue.pl --gpu 1" -# BUT options, + +# the rest of this file is present for historical reasons. it's better to +# create and edit conf/queue.conf for cluster-specific configuration. if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" @@ -20,5 +26,5 @@ if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.25" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -fi +fi diff --git a/egs/thchs30/s5/cmd.sh b/egs/thchs30/s5/cmd.sh index 6d9fe9c0fb2..1d8e768790f 100644 --- a/egs/thchs30/s5/cmd.sh +++ b/egs/thchs30/s5/cmd.sh @@ -1,6 +1,6 @@ # you can change cmd.sh depending on what type of queue you are using. # If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run +# can change all instances 'queue.pl' to run.pl (but be careful and run # commands one by one: most recipes will exhaust the memory on your # machine). queue.pl works with GridEngine (qsub). slurm.pl works # with slurm. Different queues are configured differently, with different @@ -13,4 +13,3 @@ export train_cmd=queue.pl export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" -export cuda_cmd="$train_cmd --gpu 1" diff --git a/egs/tidigits/s5/cmd.sh b/egs/tidigits/s5/cmd.sh index c8f0d9d67a7..71dd849a93b 100644 --- a/egs/tidigits/s5/cmd.sh +++ b/egs/tidigits/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -#export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/timit/s5/cmd.sh b/egs/timit/s5/cmd.sh index 0150f486298..5abbfd4495a 100644 --- a/egs/timit/s5/cmd.sh +++ b/egs/timit/s5/cmd.sh @@ -1,36 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -#export cuda_cmd=run.pl +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still sometimes used in nnet1 +# example scripts. +export cuda_cmd="queue.pl --gpu 1" - -if [ "$(hostname -d)" == "clsp.jhu.edu" ]; then - export train_cmd="queue.pl -l arch=*64*" - export decode_cmd="queue.pl -l arch=*64* --mem 3G" - export cuda_cmd="queue.pl -l gpu=1" -elif [ "$(hostname -d)" == "fit.vutbr.cz" ]; then +# the rest of this file is present for historical reasons. +# for cluster-specific configuration it's better to rely on conf/queue.conf. +if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then #b) BUT cluster options queue="all.q@@blade,all.q@@speech" gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.5" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -else - echo "$0: you need to define options for your cluster." - exit 1; + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/voxforge/s5/cmd.sh b/egs/voxforge/s5/cmd.sh index 2d454050669..71dd849a93b 100644 --- a/egs/voxforge/s5/cmd.sh +++ b/egs/voxforge/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/vystadial_cz/s5/cmd.sh b/egs/vystadial_cz/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_cz/s5/cmd.sh +++ b/egs/vystadial_cz/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/vystadial_en/s5/cmd.sh b/egs/vystadial_en/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_en/s5/cmd.sh +++ b/egs/vystadial_en/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. 
If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/wsj/s5/cmd.sh b/egs/wsj/s5/cmd.sh index 96c48af42c1..537c46ba4f2 100644 --- a/egs/wsj/s5/cmd.sh +++ b/egs/wsj/s5/cmd.sh @@ -1,23 +1,23 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - -#b) run it locally... 
-#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 2G" +export mkgraph_cmd="queue.pl --mem 4G" +# the use of cuda_cmd is deprecated. +export cuda_cmd="queue.pl --gpu 1" +# the rest of this file is present for historical reasons. +# It's better to use conf/queue.conf for cluster-specific configuration. #c) BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" diff --git a/egs/wsj/s5/local/run_kl_hmm.sh b/egs/wsj/s5/local/run_kl_hmm.sh index 9e7679a7675..efe95052c1d 100644 --- a/egs/wsj/s5/local/run_kl_hmm.sh +++ b/egs/wsj/s5/local/run_kl_hmm.sh @@ -5,6 +5,8 @@ . cmd.sh +big_memory_cmd="$decode_cmd --mem 8G" + states=20000 dir=exp/tri4b_pretrain-dbn_dnn/ From dbb028fb6188dcded7118b2650a8281d6a6fc4fe Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 19 Feb 2016 00:51:26 -0500 Subject: [PATCH 21/32] clarifying configuration process for CUDA, and give prototype Makefiles more meaningful names --- src/configure | 38 ++++++++++--------- .../{linux_cuda.mk => cuda_32bit.mk} | 4 +- .../{linux_x86_64_cuda.mk => cuda_64bit.mk} | 9 +---- 3 files changed, 23 insertions(+), 28 deletions(-) rename src/makefiles/{linux_cuda.mk => cuda_32bit.mk} (83%) rename src/makefiles/{linux_x86_64_cuda.mk => cuda_64bit.mk} (55%) diff --git a/src/configure b/src/configure index 2695859de84..0f6577dde17 100755 --- a/src/configure +++ b/src/configure @@ -403,11 +403,11 @@ function linux_configure_mkl_threading { } ## -##CUDA is used in src/cudamatrix and src/nnet{,bin} only. -##It is used to accelerate the neural network training, -##the rest of kaldi is running on CPUs. +## CUDA is used only in selected directories including src/cudamatrix, src/nnet* +## and src/chain*. It is used to accelerate the neural network training, the +## rest of kaldi runs on CPUs. ## -function linux_configure_cuda { +function configure_cuda { #check for CUDA toolkit in the system if [ ! $CUDATKDIR ]; then for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do @@ -428,9 +428,13 @@ function linux_configure_cuda { echo CUDATKDIR = $CUDATKDIR >> kaldi.mk if [ "`uname -m`" == "x86_64" ]; then - cat makefiles/linux_x86_64_cuda.mk >> kaldi.mk + if [ "`uname`" == "Darwin" ]; then + sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk + else + cat makefiles/cuda_64bit.mk >> kaldi.mk + fi else - cat makefiles/linux_cuda.mk >> kaldi.mk + cat makefiles/cuda_32bit.mk >> kaldi.mk fi else echo "CUDA will not be used! 
If you have already installed cuda drivers " @@ -541,7 +545,7 @@ function linux_configure_debian_ubuntu { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -560,7 +564,7 @@ function linux_configure_debian_ubuntu3 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -582,7 +586,7 @@ function linux_configure_debian7 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -601,7 +605,7 @@ function linux_configure_redhat { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -622,7 +626,7 @@ function linux_configure_redhat_fat { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -674,7 +678,7 @@ function linux_configure_static { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -753,7 +757,7 @@ function linux_configure_dynamic { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -813,7 +817,7 @@ echo "Doing OS specific configurations ..." # which crashes on Darwin. Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. if [ "`uname`" == "Darwin" ]; then - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda echo "On Darwin: checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate.framework to compile on Darwin." @@ -973,7 +977,7 @@ if [ "`uname`" == "Linux" ]; then fix_cxx_flag echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with MKL libs from $MKLROOT" exit_success; @@ -996,7 +1000,7 @@ if [ "`uname`" == "Linux" ]; then cat makefiles/linux_clapack.mk >> kaldi.mk fix_cxx_flag echo "Warning (CLAPACK): this part of the configure process is not properly tested and will not work." 
- $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" exit_success; @@ -1020,7 +1024,7 @@ if [ "`uname`" == "Linux" ]; then echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk cat makefiles/linux_openblas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured OpenBLAS from $OPENBLASROOT." exit_success; diff --git a/src/makefiles/linux_cuda.mk b/src/makefiles/cuda_32bit.mk similarity index 83% rename from src/makefiles/linux_cuda.mk rename to src/makefiles/cuda_32bit.mk index 502bf0ffc03..c89bf2e409d 100644 --- a/src/makefiles/linux_cuda.mk +++ b/src/makefiles/cuda_32bit.mk @@ -1,8 +1,6 @@ CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 32 -DHAVE_CUDA - -CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include +CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include LDFLAGS += -L$(CUDATKDIR)/lib -Wl,-rpath=$(CUDATKDIR)/lib LDLIBS += -lcublas -lcudart #LDLIBS : The libs are loaded later than static libs in implicit rule - diff --git a/src/makefiles/linux_x86_64_cuda.mk b/src/makefiles/cuda_64bit.mk similarity index 55% rename from src/makefiles/linux_x86_64_cuda.mk rename to src/makefiles/cuda_64bit.mk index 46613083188..25400f452f8 100644 --- a/src/makefiles/linux_x86_64_cuda.mk +++ b/src/makefiles/cuda_64bit.mk @@ -1,14 +1,7 @@ CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA - -CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -UNAME := $(shell uname) -#aware of fact in cuda60 there is no lib64, just lib. -ifeq ($(UNAME), Darwin) -CUDA_LDFLAGS += -L$(CUDATKDIR)/lib -Wl,-rpath,$(CUDATKDIR)/lib -else +CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -endif CUDA_LDLIBS += -lcublas -lcudart #LDLIBS : The libs are loaded later than static libs in implicit rule From 6b982f6fc1c2b87ba4fc608886a6b981a618b6d2 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Fri, 19 Feb 2016 16:40:26 -0500 Subject: [PATCH 22/32] xvector: extending get_egs and related scripts for xvector training --- .../local/xvector/prepare_perturbed_data.sh | 6 +- .../steps/nnet3/xvector/allocate_examples.py | 64 ++++-- egs/wsj/s5/steps/nnet3/xvector/get_egs.sh | 217 +++++++----------- src/nnet3bin/nnet3-xvector-get-egs.cc | 21 +- 4 files changed, 137 insertions(+), 171 deletions(-) diff --git a/egs/swbd/s5c/local/xvector/prepare_perturbed_data.sh b/egs/swbd/s5c/local/xvector/prepare_perturbed_data.sh index ea863cb672b..7ce4d553733 100755 --- a/egs/swbd/s5c/local/xvector/prepare_perturbed_data.sh +++ b/egs/swbd/s5c/local/xvector/prepare_perturbed_data.sh @@ -21,11 +21,11 @@ if [ $stage -le 1 ]; then if [ -d data/${datadir}_sp ]; then echo "$0: directory ${datadir}_sp already exists, skipping creating it." else - utils/data/perturb_data_dir_speed_3way.sh ${datadir} ${datadir}_sp - utils/data/perturb_data_dir_volume.sh ${datadir}_sp + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/data/perturb_data_dir_volume.sh data/${datadir}_sp fi if [ -f data/${datadir}_sp_hires/feats.scp ]; then - echo "$0: directory ${datadir}_sp_hires/feats.scp already exists, skipping creating it." + echo "$0: directory data/${datadir}_sp_hires/feats.scp already exists, skipping creating it." 
else mfccdir=mfcc utils/copy_data_dir.sh data/${datadir}_sp data/${datadir}_sp_hires diff --git a/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py b/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py index 24d6bdf217a..39e11f23b85 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py +++ b/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py @@ -9,21 +9,21 @@ # --num-archives=169 --num-jobs=24 exp/xvector_a/egs/temp/utt2len.train exp/xvector_a/egs # # and this program outputs certain things to the temp directory (exp/xvector_a/egs/temp in this case) -# that will enable you to dump the xvectors. What we'll eventually be doing is invoking the following -# program with something like the following args: +# that will enable you to dump the chunks for xvector trainign. What we'll eventually be doing is invoking +# the following program with something like the following args: # -# nnet3-xvector-get-egs1 [options] exp/xvector_a/temp/ranges.1 scp:data/train/feats.scp \ +# nnet3-xvector-get-egs [options] exp/xvector_a/temp/ranges.1 scp:data/train/feats.scp \ # ark:exp/xvector_a/egs/egs_temp.1.ark ark:exp/xvector_a/egs/egs_temp.2.ark \ # ark:exp/xvector_a/egs/egs_temp.3.ark # # where exp/xvector_a/temp/ranges.1 contains something like the following: # -# utt1 3 0 65 112 110 -# utt1 0 160 50 214 180 +# utt1 3 0 0 65 112 110 +# utt1 0 2 160 50 214 180 # utt2 ... # # where each line is interpreted as follows: -# +# # and for each line we create an eg (containing two possibly-different-length chunks of data from the # same utterance), to one of the output archives. The list of archives corresponding to # ranges.n will be written to output.n, so in exp/xvector_a/temp/outputs.1 we'd have: @@ -52,10 +52,18 @@ parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and archive_chunk_lengths files " "in preparation for dumping egs for xvector training.", epilog="Called by steps/nnet3/xvector/get_egs.sh") +parser.add_argument("--prefix", type=str, default="", + help="Adds a prefix to the output files. This is used to distinguish between the train " + "and diagnostic files.") parser.add_argument("--min-frames-per-chunk", type=int, default=50, help="Minimum number of frames-per-chunk used for any archive") parser.add_argument("--max-frames-per-chunk", type=int, default=300, help="Maximum number of frames-per-chunk used for any archive") +parser.add_argument("--randomize-chunk-length", type=str, + help="If true, randomly pick a chunk length in [min-frames-per-chunk, max-frames-per-chunk]." + "If false, the chunk length varies from min-frames-per-chunk to max-frames-per-chunk" + "according to a geometric sequence.", + default="true", choices = ["false", "true"]) parser.add_argument("--frames-per-iter", type=int, default=1000000, help="Target number of frames for each archive") parser.add_argument("--num-archives", type=int, default=-1, @@ -137,6 +145,18 @@ def RandomChunkLength(): ans = int(math.exp(log_value) + 0.45) return ans +# This function returns an integer in the range +# [min-frames-per-chunk, max-frames-per-chunk] according to a geometric +# sequence. For example, suppose min-frames-per-chunk is 50, +# max-frames-per-chunk is 200, and args.num_archives is 3. Then the +# lengths for archives 0, 1, and 2 will be 50, 100, and 200. 
+def DeterministicChunkLength(archive_id): + ans = int(math.pow(float(args.max_frames_per_chunk) / + args.min_frames_per_chunk, float(archive_id) / + (args.num_archives-1)) * args.min_frames_per_chunk + 0.5) + return ans + + # given an utterance length utt_length (in frames) and two desired chunk lengths # (length1 and length2) whose sum is <= utt_length, @@ -180,14 +200,21 @@ def GetRandomOffsets(utt_length, length1, length2): # an array of 3-tuples (utterance-index, offset1, offset2) all_egs= [] -info_f = open(args.egs_dir + "/temp/archive_chunk_lengths", "w") -if info_f is None: - sys.exit("Error opening file {0}/temp/archive_chunk_lengths".format(args.egs_dir)); +prefix = "" +if args.prefix != "": + prefix = args.prefix + "_" +info_f = open(args.egs_dir + "/temp/" + prefix + "archive_chunk_lengths", "w") +if info_f is None: + sys.exit(str("Error opening file {0}/temp/" + prefix + "archive_chunk_lengths").format(args.egs_dir)); for archive_index in range(args.num_archives): print("Processing archive {0}".format(archive_index + 1)) - length1 = RandomChunkLength(); - length2 = RandomChunkLength(); + if args.randomize_chunk_length == "true": + length1 = RandomChunkLength(); + length2 = length1 + else: + length1 = DeterministicChunkLength(archive_index); + length2 = length1 print("{0} {1} {2}".format(archive_index + 1, length1, length2), file=info_f) archive_chunk_lengths.append( (length1, length2) ) tot_length = length1 + length2 @@ -218,12 +245,13 @@ def GetRandomOffsets(utt_length, length1, length2): for (utterance_index, offset1, offset2) in all_egs[cur_archive]: this_ranges.append( (utterance_index, i, offset1, offset2) ) cur_archive = cur_archive + 1 - f = open(args.egs_dir + "/temp/ranges." + str(job + 1), "w") + f = open(args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1), "w") if f is None: - sys.exit("Error opening file " + args.egs_dir + "/temp/ranges." + str(job + 1)) + sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1)) for (utterance_index, i, offset1, offset2) in sorted(this_ranges): archive_index = this_archives_for_job[i] - print("{0} {1} {2} {3} {4}".format(utt_ids[utterance_index], + print("{0} {1} {2} {3} {4} {5} {6}".format(utt_ids[utterance_index], + i, archive_index + 1, offset1, archive_chunk_lengths[archive_index][0], @@ -232,13 +260,13 @@ def GetRandomOffsets(utt_length, length1, length2): file=f) f.close() - f = open(args.egs_dir + "/temp/outputs." + str(job + 1), "w") + f = open(args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1), "w") if f is None: - sys.exit("Error opening file " + args.egs_dir + "/temp/outputs." + str(job + 1)) - print( " ".join([ "{0}/egs_temp.{1}.ark".format(args.egs_dir, n + 1) for n in this_archives_for_job ]), + sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1)) + print( " ".join([ str("{0}/" + prefix + "egs_temp.{1}.ark").format(args.egs_dir, n + 1) for n in this_archives_for_job ]), file=f) f.close() -print("allocate_examples.py: finished generating ranges.* and outputs.* files") +print("allocate_examples.py: finished generating " + prefix + "ranges.* and " + prefix + "outputs.* files") diff --git a/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh b/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh index 4b0d558bc09..2ab81395d47 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh @@ -1,6 +1,8 @@ #!/bin/bash -# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
+# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey) +# 2016 David Snyder +# Apache 2.0 # # This script dumps training examples (egs) for xvector training. These egs # have only an input and no outputs (the inputs are typically MFCCs). The egs @@ -15,12 +17,6 @@ # This script, which will generally be called from other neural-net training # scripts, extracts the training examples used to train the neural net (and also # the validation examples used for diagnostics), and puts them in separate archives. -# -# This script dumps egs with several frames of labels, controlled by the -# frames_per_eg config variable (default: 8). This takes many times less disk -# space because typically we have 4 to 7 frames of context on the left and -# right, and this ends up getting shared. This is at the expense of slightly -# higher disk I/O while training. # Begin configuration section. @@ -94,6 +90,9 @@ if [ ! -f $data/feats.scp ]; then exit 1 fi +sdata=$data/split$nj +utils/split_data.sh $data $nj + if [ ! -f $data/utt2dur ]; then # getting this utt2dur will normally be more lightweight than # getting the exact utterance-to-length map. @@ -120,28 +119,42 @@ if [ $stage -le 1 ]; then echo "$0: getting list of validation utterances" # Get list of validation utterances. - awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ + awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_heldout_utts \ > $temp/valid_uttlist || exit 1; + awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $temp/valid_uttlist \ + | utils/shuffle_list.pl | head -$num_heldout_utts > $temp/train_subset_uttlist || exit 1; + if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. - echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" - echo "include all perturbed versions of the same 'real' utterances." - mv $temp/valid_uttlist $temp/valid_uttlist.tmp utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $temp/uniq2utt - cat $temp/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ - sort | uniq | utils/apply_map.pl $temp/uniq2utt | \ - awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $temp/valid_uttlist - rm $temp/uniq2utt $temp/valid_uttlist.tmp + for uttlist in valid_uttlist train_subset_uttlist; do + echo "File $data/utt2uniq exists, so augmenting $uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $temp/$uttlist $temp/${uttlist}.tmp + cat $temp/$uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $temp/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $temp/$uttlist + done + rm $temp/uniq2utt $temp/$uttlist.tmp fi + + awk '{print $1}' $temp/utt2len | utils/filter_scp.pl --exclude $temp/valid_uttlist <$temp/utt2len > $temp/utt2len.train utils/filter_scp.pl $temp/valid_uttlist <$temp/utt2len > $temp/utt2len.valid + utils/filter_scp.pl $temp/train_subset_uttlist <$temp/utt2len > $temp/utt2len.train_subset fi +# TODO: Currently just supporting raw features +feats="scp,s,cs:utils/filter_scp.pl $temp/ranges.JOB $data/feats.scp |" +valid_feats="scp,s,cs:utils/filter_scp.pl $temp/valid_uttlist $data/feats.scp |" +train_subset_feats="scp,s,cs:utils/filter_scp.pl $temp/train_subset_uttlist $data/feats.scp |" + # first for the training data... work out how many archives. 
num_train_frames=$(awk '{n += $2} END{print n}' <$temp/utt2len.train) num_valid_frames=$(awk '{n += $2} END{print n}' <$temp/utt2len.valid) +num_train_subset_frames=$(awk '{n += $2} END{print n}' <$temp/utt2len.train_subset) echo $num_train_frames >$dir/info/num_frames @@ -166,147 +179,71 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: allocating examples" - $cmd $dir/log/allocate_examples.log \ + echo "$0: allocating training examples" + $cmd $dir/log/allocate_examples_train.log \ steps/nnet3/xvector/allocate_examples.py \ --min-frames-per-chunk=$min_frames_per_chunk \ --max-frames-per-chunk=$max_frames_per_chunk \ --frames-per-iter=$frames_per_iter \ --num-archives=$num_train_archives --num-jobs=$nj \ $dir/temp/utt2len.train $dir || exit 1 -fi - -# HERE - todo. - -exit 0 - + echo "$0: allocating training subset examples" + $cmd $dir/log/allocate_examples_train_subset.log \ + steps/nnet3/xvector/allocate_examples.py \ + --prefix train_subset \ + --min-frames-per-chunk=$min_frames_per_chunk \ + --max-frames-per-chunk=$max_frames_per_chunk \ + --randomize-chunk-length false \ + --frames-per-iter=$frames_per_iter_diagnostic \ + --num-archives=$num_diagnostic_archives --num-jobs=1 \ + $dir/temp/utt2len.train_subset $dir || exit 1 - -if [ $stage -le 2 ]; then - echo "$0: copying data alignments" - for id in $(seq $num_ali_jobs); do gunzip -c $alidir/ali.$id.gz; done | \ - copy-int-vector ark:- ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; -fi - -egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" - -[ -z $valid_left_context ] && valid_left_context=$left_context; -[ -z $valid_right_context ] && valid_right_context=$right_context; -valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" - -echo $left_context > $dir/info/left_context -echo $right_context > $dir/info/right_context -num_pdfs=$(tree-info --print-args=false $alidir/tree | grep num-pdfs | awk '{print $2}') -if [ $stage -le 3 ]; then - echo "$0: Getting validation and training subset examples." - rm $dir/.error 2>/dev/null - echo "$0: ... extracting validation and training-subset alignments." - - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ - <$dir/ali.scp >$dir/ali_special.scp - - $cmd $dir/log/create_valid_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $valid_egs_opts "$valid_feats" \ - "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/valid_all.egs" || touch $dir/.error & - $cmd $dir/log/create_train_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ - "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/train_subset_all.egs" || touch $dir/.error & - wait; - [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 - echo "... Getting subsets of validation examples for diagnostics and combination." 
- $cmd $dir/log/create_valid_subset_combine.log \ - nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ - ark:$dir/valid_combine.egs || touch $dir/.error & - $cmd $dir/log/create_valid_subset_diagnostic.log \ - nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ - ark:$dir/valid_diagnostic.egs || touch $dir/.error & - - $cmd $dir/log/create_train_subset_combine.log \ - nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ - ark:$dir/train_combine.egs || touch $dir/.error & - $cmd $dir/log/create_train_subset_diagnostic.log \ - nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ - ark:$dir/train_diagnostic.egs || touch $dir/.error & - wait - sleep 5 # wait for file system to sync. - cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs - - for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do - [ ! -s $f ] && echo "No examples in file $f" && exit 1; - done - rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs + echo "$0: allocating validation examples" + $cmd $dir/log/allocate_examples_valid.log \ + steps/nnet3/xvector/allocate_examples.py \ + --prefix valid \ + --min-frames-per-chunk=$min_frames_per_chunk \ + --max-frames-per-chunk=$max_frames_per_chunk \ + --randomize-chunk-length false \ + --frames-per-iter=$frames_per_iter_diagnostic \ + --frames-per-iter=$frames_per_iter_diagnostic \ + --num-archives=$num_diagnostic_archives --num-jobs=1 \ + $dir/temp/utt2len.valid $dir || exit 1 fi if [ $stage -le 4 ]; then - # create egs_orig.*.*.ark; the first index goes to $nj, - # the second to $num_archives_intermediate. - - egs_list= - for n in $(seq $num_archives_intermediate); do - egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark" - done echo "$0: Generating training examples on disk" - # The examples will go round-robin to egs_list. - $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ - "ark,s,cs:filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ - nnet3-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; + for g in $(seq $nj); do + outputs=`awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/outputs.$g` + $cmd $dir/log/train_create_examples.$g.log \ + nnet3-xvector-get-egs $temp/ranges.$g \ + "`echo $feats | sed s/JOB/$g/g`" $outputs || exit 1 & + done + wait + train_subset_outputs=`awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/train_subset_outputs.1` + echo "$0: Generating training subset examples on disk" + $cmd $dir/log/train_subset_create_examples.1.log \ + nnet3-xvector-get-egs $temp/train_subset_ranges.1 \ + "$train_subset_feats" $train_subset_outputs || exit 1 + valid_outputs=`awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/valid_outputs.1` + echo "$0: Generating validation examples on disk" + $cmd $dir/log/valid_create_examples.1.log \ + nnet3-xvector-get-egs $temp/valid_ranges.1 \ + "$valid_feats" $valid_outputs || exit 1 fi if [ $stage -le 5 ]; then - echo "$0: recombining and shuffling order of archives on disk" - # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and - # shuffle the order, writing to the egs.JOB.ark - - # the input is a concatenation over the input jobs. - egs_list= - for n in $(seq $nj); do - egs_list="$egs_list $dir/egs_orig.$n.JOB.ark" - done - - if [ $archives_multiple == 1 ]; then # normal case. 
- $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; - else - # we need to shuffle the 'intermediate archives' and then split into the - # final archives. we create soft links to manage this splitting, because - # otherwise managing the output names is quite difficult (and we don't want - # to submit separate queue jobs for each intermediate archive, because then - # the --max-jobs-run option is hard to enforce). - output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)" - for x in $(seq $num_archives_intermediate); do - for y in $(seq $archives_multiple); do - archive_index=$[($x-1)*$archives_multiple+$y] - # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark - ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1 - done - done - $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:- \| \ - nnet3-copy-egs ark:- $output_archives || exit 1; - fi - + echo "$0: Shuffling order of archives on disk" + $cmd --max-jobs-run $nj JOB=1:$num_train_archives $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB ark:$dir/egs_temp.JOB.ark ark:$dir/egs.JOB.ark || exit 1; + + $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/train_subset_shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB ark:$dir/train_subset_egs_temp.JOB.ark ark:$dir/train_diagnostic_egs.JOB.ark || exit 1; + $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/valid_shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB ark:$dir/valid_egs_temp.JOB.ark ark:$dir/valid_diagnostic_egs.JOB.ark || exit 1; fi -if [ $stage -le 6 ]; then - echo "$0: removing temporary archives" - for x in $(seq $nj); do - for y in $(seq $num_archives_intermediate); do - file=$dir/egs_orig.$x.$y.ark - [ -L $file ] && rm $(readlink -f $file) - rm $file - done - done - if [ $archives_multiple -gt 1 ]; then - # there are some extra soft links that we should delete. - for f in $dir/egs.*.*.ark; do rm $f; done - fi - echo "$0: removing temporary alignments and transforms" - # Ignore errors below because trans.* might not exist. - rm $dir/{ali,trans}.{ark,scp} 2>/dev/null -fi +#TODO: Probably need to cleanup the temp egs. echo "$0: Finished preparing training examples" diff --git a/src/nnet3bin/nnet3-xvector-get-egs.cc b/src/nnet3bin/nnet3-xvector-get-egs.cc index 24e50560b54..55ba475c0fe 100644 --- a/src/nnet3bin/nnet3-xvector-get-egs.cc +++ b/src/nnet3bin/nnet3-xvector-get-egs.cc @@ -49,15 +49,15 @@ static void ProcessRangeFile(const std::string &range_rxfilename, ChunkPairInfo *pair = new ChunkPairInfo(); std::vector fields; SplitStringToVector(line, " \t\n\r", true, &fields); - if (fields.size() != 6) - KALDI_ERR << "Expected 6 fields in line of range file, got " + if (fields.size() != 7) + KALDI_ERR << "Expected 7 fields in line of range file, got " << fields.size() << " instead."; std::string utt = fields[0], - start_frame1_str = fields[2], - num_frames1_str = fields[3], - start_frame2_str = fields[4], - num_frames2_str = fields[5]; + start_frame1_str = fields[3], + num_frames1_str = fields[4], + start_frame2_str = fields[5], + num_frames2_str = fields[6]; if (!ConvertStringToInteger(fields[1], &(pair->output_archive_id)) || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1)) @@ -166,11 +166,12 @@ int main(int argc, char *argv[]) { "the same utterance. 
The location and length of the feature chunks\n" "are specified in the 'ranges' file. Each line is interpreted as\n" "follows:\n" - " " - " \n" + " " + " " + " \n" "For example:\n" - " utt1 3 0 65 112 110\n" - " utt1 0 160 50 214 180\n" + " utt1 3 13 0 65 112 110\n" + " utt1 0 10 160 50 214 180\n" " utt2 ...\n" "\n" "Usage: nnet3-xvector-get-egs [options] " From 96413f5c027cff4cfc2912e41e7935940f4a497d Mon Sep 17 00:00:00 2001 From: David Snyder Date: Fri, 19 Feb 2016 16:47:58 -0500 Subject: [PATCH 23/32] xvector: fixing typo --- egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py b/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py index 39e11f23b85..8ef0ded7c15 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py +++ b/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py @@ -9,7 +9,7 @@ # --num-archives=169 --num-jobs=24 exp/xvector_a/egs/temp/utt2len.train exp/xvector_a/egs # # and this program outputs certain things to the temp directory (exp/xvector_a/egs/temp in this case) -# that will enable you to dump the chunks for xvector trainign. What we'll eventually be doing is invoking +# that will enable you to dump the chunks for xvector training. What we'll eventually be doing is invoking # the following program with something like the following args: # # nnet3-xvector-get-egs [options] exp/xvector_a/temp/ranges.1 scp:data/train/feats.scp \ From 90af624290c987ddbd89dee48c0462093c62cafd Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Fri, 19 Feb 2016 17:24:38 -0500 Subject: [PATCH 24/32] chain branch: bug-fix in self-repair code for sigmoid units --- src/nnet3/nnet-simple-component.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index aadd0c05a1d..28fa24b4fae 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -510,7 +510,7 @@ void SigmoidComponent::RepairGradients( in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability, out_value, kNoTrans, thresholds_vec); - in_deriv->AddVecToCols(self_repair_scale_ / repair_probability, + in_deriv->AddVecToRows(self_repair_scale_ / repair_probability, thresholds_vec); } From e980e75778eea797ed96e3583d3ac11c713d2175 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 19 Feb 2016 19:25:19 -0500 Subject: [PATCH 25/32] small cosmetic change to RM example script for chain models --- egs/rm/s5/local/chain/run_tdnn_5f.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/rm/s5/local/chain/run_tdnn_5f.sh b/egs/rm/s5/local/chain/run_tdnn_5f.sh index 3cd46707ef3..0379d16fe13 100644 --- a/egs/rm/s5/local/chain/run_tdnn_5f.sh +++ b/egs/rm/s5/local/chain/run_tdnn_5f.sh @@ -52,7 +52,7 @@ if [ $stage -le 4 ]; then # Get the alignments as lattices (gives the CTC training more freedom). # use the same num-jobs as the alignments nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd -l q=all.q" data/train \ + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \ data/lang exp/tri3b exp/tri3b_lats rm exp/tri3b_lats/fsts.*.gz # save space fi @@ -74,7 +74,7 @@ if [ $stage -le 6 ]; then # Build a tree using our new topology. 
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --leftmost-questions-truncate $leftmost_questions_truncate \ - --cmd "$train_cmd -l q=all.q" 1200 data/train $lang $ali_dir $treedir + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir fi if [ $stage -le 7 ]; then From 0ca297f1239c11a19728694d2586b574250cca90 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Sat, 20 Feb 2016 13:59:45 -0500 Subject: [PATCH 26/32] xvector: fixing makefiles and paths for xvector code --- egs/swbd/s5c/path.sh | 2 +- src/xvector/nnet-xvector-diagnostics.h | 6 +++--- src/xvector/nnet-xvector-training.h | 6 +++--- src/xvector/xvector.h | 4 ++-- src/xvectorbin/Makefile | 5 +++-- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/egs/swbd/s5c/path.sh b/egs/swbd/s5c/path.sh index c6b8450c86a..a07adf42589 100755 --- a/egs/swbd/s5c/path.sh +++ b/egs/swbd/s5c/path.sh @@ -1,4 +1,4 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/chainbin:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/xvectorbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/chainbin:$KALDI_ROOT/src/kwsbin:$PWD:$PATH export LC_ALL=C diff --git a/src/xvector/nnet-xvector-diagnostics.h b/src/xvector/nnet-xvector-diagnostics.h index d43a38a3ed4..046088518b1 100644 --- a/src/xvector/nnet-xvector-diagnostics.h +++ b/src/xvector/nnet-xvector-diagnostics.h @@ -18,8 +18,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#ifndef KALDI_NNET3_NNET_XVECTOR_DIAGNOSTICS_H_ -#define KALDI_NNET3_NNET_XVECTOR_DIAGNOSTICS_H_ +#ifndef KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ #include "nnet3/nnet-example.h" #include "nnet3/nnet-computation.h" @@ -89,4 +89,4 @@ class NnetXvectorComputeProb { } // namespace nnet3 } // namespace kaldi -#endif // KALDI_NNET3_NNET_XVECTOR_DIAGNOSTICS_H_ +#endif // KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ diff --git a/src/xvector/nnet-xvector-training.h b/src/xvector/nnet-xvector-training.h index e8fb3d20e6a..58ff9211310 100644 --- a/src/xvector/nnet-xvector-training.h +++ b/src/xvector/nnet-xvector-training.h @@ -18,8 +18,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
-#ifndef KALDI_IVECTOR_NNET_XVECTOR_TRAINING_H_ -#define KALDI_IVECTOR_NNET_XVECTOR_TRAINING_H_ +#ifndef KALDI_XVECTOR_NNET_XVECTOR_TRAINING_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_TRAINING_H_ #include "nnet3/nnet-example.h" #include "nnet3/nnet-computation.h" @@ -86,4 +86,4 @@ void GetComputationRequestXvector(const Nnet &nnet, } // namespace nnet3 } // namespace kaldi -#endif // +#endif // diff --git a/src/xvector/xvector.h b/src/xvector/xvector.h index 50d58ec7a93..75083533acd 100644 --- a/src/xvector/xvector.h +++ b/src/xvector/xvector.h @@ -18,8 +18,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#ifndef KALDI_IVECTOR_XVECTOR_H_ -#define KALDI_IVECTOR_XVECTOR_H_ +#ifndef KALDI_XVECTOR_XVECTOR_H_ +#define KALDI_XVECTOR_XVECTOR_H_ #include #include "base/kaldi-common.h" diff --git a/src/xvectorbin/Makefile b/src/xvectorbin/Makefile index 63a689a1e44..1dc1bee6e0a 100644 --- a/src/xvectorbin/Makefile +++ b/src/xvectorbin/Makefile @@ -6,7 +6,8 @@ include ../kaldi.mk LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) -BINFILES = nnet3-xvector-get-egs +BINFILES = nnet3-xvector-get-egs nnet3-xvector-compute-prob \ + nnet3-xvector-show-progress nnet3-xvector-train OBJFILES = @@ -15,7 +16,7 @@ cuda-compiled.o: ../kaldi.mk TESTFILES = -ADDLIBS = ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \ +ADDLIBS = ../xvector/kaldi-xvector.a ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \ ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ ../thread/kaldi-thread.a ../cudamatrix/kaldi-cudamatrix.a \ From c1fac7ef61e4afcd9a5fa07f273edaf06f479cc3 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Sat, 20 Feb 2016 14:03:30 -0500 Subject: [PATCH 27/32] xvector: fixing Makefiles --- src/Makefile | 9 +++++---- src/ivector/Makefile | 3 +-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index 57a4b98e0aa..c3fe511486f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -9,13 +9,13 @@ SUBDIRS = base matrix util feat tree thread gmm transform sgmm \ fstext hmm lm decoder lat kws cudamatrix nnet \ bin fstbin gmmbin fgmmbin sgmmbin featbin \ nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 chain nnet3bin nnet2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin chainbin + ivector ivectorbin xvector xvectorbin online2 online2bin lmbin chainbin MEMTESTDIRS = base matrix util feat tree thread gmm transform sgmm \ fstext hmm lm decoder lat nnet kws chain \ bin fstbin gmmbin fgmmbin sgmmbin featbin \ nnetbin latbin sgmm2 nnet2 nnet3 nnet2bin nnet3bin sgmm2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin + ivector ivectorbin xvector xvectorbin online2 online2bin lmbin CUDAMEMTESTDIR = cudamatrix @@ -145,9 +145,9 @@ $(EXT_SUBDIRS) : mklibdir # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \ +bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin xvectorbin lmbin kwsbin online2bin: \ base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm \ - lm decoder lat cudamatrix nnet nnet2 nnet3 ivector + lm decoder lat cudamatrix nnet nnet2 nnet3 ivector xvector #2)The libraries have inter-dependencies base: @@ -172,6 +172,7 @@ nnet2: base util matrix thread lat gmm hmm tree transform cudamatrix nnet3: base util 
matrix thread lat gmm hmm tree transform cudamatrix chain chain: lat hmm tree fstext matrix cudamatrix util base ivector: base util matrix thread transform tree gmm +xvector: base util matrix cudamatrix nnet3 #3)Dependencies for optional parts of Kaldi onlinebin: base matrix util feat tree optimization gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread # python-kaldi-decoding: base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm decoder lat online diff --git a/src/ivector/Makefile b/src/ivector/Makefile index bbf4b01faf9..879cc6e69b2 100644 --- a/src/ivector/Makefile +++ b/src/ivector/Makefile @@ -15,8 +15,7 @@ OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o logistic-regres LIBNAME = kaldi-ivector ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../transform/kaldi-transform.a \ - ../thread/kaldi-thread.a ../nnet3/kaldi-nnet3.a ../cudamatrix/kaldi-cudamatrix.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ + ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ ../util/kaldi-util.a include ../makefiles/default_rules.mk From 5cdf82eec7ed11a5269ed4597bb2e5535c5c296e Mon Sep 17 00:00:00 2001 From: David Snyder Date: Sat, 20 Feb 2016 22:12:23 -0500 Subject: [PATCH 28/32] xvector: Fixes to make_jesus_configs.py --- .../s5/steps/nnet3/xvector/make_jesus_configs.py | 14 +++++++------- src/nnet3/nnet-general-component.cc | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py index 438f20b083e..51d58c5b89c 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py +++ b/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py @@ -138,7 +138,6 @@ def __init__(self, config_string, input_dim, num_jesus_blocks, input_name): def OutputDim(self): return self.input_dim * (2 if self.output_stddev else 1) + (self.num_jesus_blocks if self.output_count else 0) - # OutputDims() returns an array of output dimensions... this node produces # one output node, but this array explains how it's split up into different types # of output (which will affect how we reorder the indexes for the jesus-layer). @@ -168,10 +167,11 @@ def WriteConfigs(self, f): self.input_name, self.left_context, self.right_context), file=f) stats_dim = 1 + self.input_dim * (2 if self.output_stddev else 1) print('component name={0}-pooling-{1}-{2} type=StatisticsPoolingComponent input-dim={3} ' - 'input-period={4} left-context={1} right-context={2} num-log-count-features=0 ' + 'input-period={4} left-context={1} right-context={2} num-log-count-features={6} ' 'output-stddevs={5} '.format(self.input_name, self.left_context, self.right_context, stats_dim, self.stats_period, - ('true' if self.output_stddev else 'false')), + ('true' if self.output_stddev else 'false'), + (self.num_jesus_blocks if self.output_count else 0)), file=f) print('component-node name={0}-pooling-{1}-{2} component={0}-pooling-{1}-{2} input={0}-extraction-{1}-{2} '.format( self.input_name, self.left_context, self.right_context), file=f) @@ -369,7 +369,6 @@ def WriteConfigs(self, f): cur_output = 'x-jesus{0}-output-affine'.format(l) - print('component name=x-final-relu type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( cur_affine_output_dim, args.self_repair_scale), file=f) print('component-node name=x-final-relu component=x-final-relu input={0}'.format(cur_output), @@ -394,10 +393,11 @@ def WriteConfigs(self, f): # nodes. 
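# (The 's' output created below holds a symmetric matrix in packed triangular
# form, which is why its dimension is output_dim * (output_dim + 1) / 2 rather
# than output_dim * output_dim.)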
# First the S output...
-s_dim = ((args.output_dim)*(args.output_dim+1)) / 2
+s_dim = ((args.output_dim)*(args.output_dim+1))/2
+
print('component name=x-s type=ConstantFunctionComponent input-dim={0} output-dim={1} '
'output-mean=0 output-stddev=0 '.format(
- args.feat_dim, ((args.output_dim)+(args.output_dim+1))/2), file=f)
+ args.feat_dim, s_dim), file=f)
print('component-node name=x-s component=x-s input=IfDefined(input)',
file=f)
print('component name=x-s-scale type=FixedScaleComponent dim={0} scale={1}'.format(
@@ -413,7 +413,7 @@ def WriteConfigs(self, f):
print('component-node name=x-b component=x-b input=IfDefined(input)',
file=f)
print('component name=x-b-scale type=FixedScaleComponent dim=1 scale={0}'.format(
args.b_scale), file=f);
-print('component-node name=x-b-scale component=x-b-scale input=input',
+print('component-node name=x-b-scale component=x-b-scale input=x-b',
file=f)
print('output-node name=b input=x-b-scale', file=f)
f.close()
diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index 80793bf1d98..f40a750f894 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -847,7 +847,7 @@ void StatisticsPoolingComponent::Backprop(
variance_deriv.Scale(0.5);
// the deriv w.r.t. the uncentered variance is the same as w.r.t. the
- // uncentered variance (since they difer by a constant term of -(mean *
+ // uncentered variance (since they differ by a constant term of -(mean *
// mean), but we need to add to dF/dmean, the value -2.0 * mean *
// dF/dvariance.
mean_deriv.AddMatMatElements(-2.0, mean_value, variance_deriv, 1.0);
From 72dfb918a421b65af33ab8d26e19e509a0003f7b Mon Sep 17 00:00:00 2001
From: David Snyder
Date: Sun, 21 Feb 2016 19:56:43 -0500
Subject: [PATCH 29/32] xvector: add binary nnet3-xvector-merge-egs, which is the same as nnet3-merge-egs, but it doesn't rely on there being any outputs in the examples. Also some minor fixes to the training scripts.
---
 egs/wsj/s5/steps/nnet3/xvector/get_egs.sh | 1 +
 egs/wsj/s5/steps/nnet3/xvector/train.sh | 14 +--
 src/xvectorbin/Makefile | 3 +-
 src/xvectorbin/nnet3-xvector-merge-egs.cc | 108 ++++++++++++++++++++++
 4 files changed, 118 insertions(+), 8 deletions(-)
 create mode 100644 src/xvectorbin/nnet3-xvector-merge-egs.cc
diff --git a/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh b/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh
index 2ab81395d47..7c74fff6090 100755
--- a/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh
+++ b/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh
@@ -161,6 +161,7 @@ echo $num_train_frames >$dir/info/num_frames
num_train_archives=$[($num_train_frames*$num_repeats)/$frames_per_iter + 1]
echo "$0: producing $num_train_archives archives for training"
echo $num_train_archives > $dir/info/num_archives
+echo $num_diagnostic_archives > $dir/info/num_diagnostic_archives
if [ $nj -gt $num_train_archives ]; then
diff --git a/egs/wsj/s5/steps/nnet3/xvector/train.sh b/egs/wsj/s5/steps/nnet3/xvector/train.sh
index b66c95b3c39..c57d66f7019 100755
--- a/egs/wsj/s5/steps/nnet3/xvector/train.sh
+++ b/egs/wsj/s5/steps/nnet3/xvector/train.sh
@@ -132,12 +132,12 @@ while [ $x -lt $num_iters ]; do
# Set off jobs doing some diagnostics, in the background.
# Use the egs dir from the previous iteration for the diagnostics - $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.log \ - nnet3-xvector-compute-prob "$dir/$x.raw - |" \ - "ark:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.JOB.egs ark:- |" & - $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.log \ - nnet3-xvector-compute-prob "nnet3-am-copy --raw=true $dir/$x.raw - |" \ - "ark:nnet3-merge-egs ark:$egs_dir/train_diagnostic.JOB.egs ark:- |" & + $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ + nnet3-xvector-compute-prob $dir/$x.raw \ + "ark:nnet3-xvector-merge-egs ark:$egs_dir/valid_diagnostic_egs.JOB.ark ark:- |" & + $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.JOB.log \ + nnet3-xvector-compute-prob $dir/$x.raw \ + "ark:nnet3-xvector-merge-egs ark:$egs_dir/train_diagnostic_egs.JOB.ark ark:- |" & if [ $x -gt 0 ]; then $cmd $dir/log/progress.$x.log \ @@ -176,7 +176,7 @@ while [ $x -lt $num_iters ]; do nnet3-xvector-train $parallel_train_opts --print-interval=10 \ --max-param-change=$max_param_change \ $dir/$x.raw \ - "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$minibatch_size --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-xvector-merge-egs --minibatch-size=$minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait diff --git a/src/xvectorbin/Makefile b/src/xvectorbin/Makefile index 1dc1bee6e0a..e0703ab8422 100644 --- a/src/xvectorbin/Makefile +++ b/src/xvectorbin/Makefile @@ -7,7 +7,8 @@ LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) BINFILES = nnet3-xvector-get-egs nnet3-xvector-compute-prob \ - nnet3-xvector-show-progress nnet3-xvector-train + nnet3-xvector-show-progress nnet3-xvector-train \ + nnet3-xvector-merge-egs OBJFILES = diff --git a/src/xvectorbin/nnet3-xvector-merge-egs.cc b/src/xvectorbin/nnet3-xvector-merge-egs.cc new file mode 100644 index 00000000000..28dc9d2ee18 --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-merge-egs.cc @@ -0,0 +1,108 @@ +// xvectorbin/nnet3-xvector-merge-egs.cc + +// Copyright 2016 David Snyder +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-example-utils.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + + const char *usage = + "This copies nnet examples for xvector training from input to\n" + "output but while doing so it merges many NnetExample objects\n" + "into one, forming a minibatch consisting of single NnetExample.\n" + "Unlike nnet3-merge-egs, this binary does not expect the examples\n" + "to have any output.\n" + "\n" + "Usage: nnet3-xvector-merge-egs [options] " + "\n" + "e.g.\n" + "nnet3-xvector-merge-egs --minibatch-size=512 ark:1.egs ark:- " + "| nnet3-xvector-train ... \n" + "See also nnet3-copy-egs and nnet3-merge-egs\n"; + + bool compress = false; + int32 minibatch_size = 512; + bool discard_partial_minibatches = false; + + ParseOptions po(usage); + po.Register("minibatch-size", &minibatch_size, "Target size of " + "minibatches when merging."); + po.Register("compress", &compress, "If true, compress the output examples " + "(not recommended unless you are writing to disk)"); + po.Register("discard-partial-minibatches", &discard_partial_minibatches, + "discard any partial minibatches of 'uneven' size that may be " + "encountered at the end."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string examples_rspecifier = po.GetArg(1), + examples_wspecifier = po.GetArg(2); + + SequentialNnetExampleReader example_reader(examples_rspecifier); + NnetExampleWriter example_writer(examples_wspecifier); + + std::vector examples; + examples.reserve(minibatch_size); + + int32 num_read = 0, num_written = 0; + while (!example_reader.Done()) { + const NnetExample &cur_eg = example_reader.Value(); + examples.resize(examples.size() + 1); + examples.back() = cur_eg; + bool minibatch_ready = + static_cast(examples.size()) >= minibatch_size; + + // Do Next() now, so we can test example_reader.Done() below . + example_reader.Next(); + num_read++; + + if (minibatch_ready || (!discard_partial_minibatches && + (example_reader.Done() && !examples.empty()))) { + NnetExample merged_eg; + MergeExamples(examples, compress, &merged_eg); + std::ostringstream ostr; + ostr << "merged-" << num_written; + num_written++; + std::string output_key = ostr.str(); + example_writer.Write(output_key, merged_eg); + examples.clear(); + } + } + KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; + return (num_written != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + From a8c0339edb544285fdebb55782cd11f0fe5d2041 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Sun, 21 Feb 2016 20:14:30 -0500 Subject: [PATCH 30/32] xvector: undoing most of the previous commit--instead of creating a separate binary for merging egs for xvectors, the nnet3bin version was modified instead to allow for egs without output. 
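For context: with --measure-output-frames=false, nnet3-merge-egs counts whole
examples rather than output frames when deciding that a minibatch is complete,
so it can merge xvector egs that contain no output. A rough sketch of the
merging stage as the training script now invokes it (the archive name
egs.1.ark and the minibatch size of 512 are only illustrative here; see the
train.sh change below for the actual pipeline):

  nnet3-merge-egs --measure-output-frames=false --minibatch-size=512 \
    --discard-partial-minibatches=true ark:egs.1.ark ark:- | \
    nnet3-xvector-train ...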
--- egs/wsj/s5/steps/nnet3/xvector/train.sh | 6 +- src/nnet3bin/nnet3-merge-egs.cc | 11 ++- src/xvectorbin/Makefile | 3 +- src/xvectorbin/nnet3-xvector-merge-egs.cc | 108 ---------------------- 4 files changed, 10 insertions(+), 118 deletions(-) delete mode 100644 src/xvectorbin/nnet3-xvector-merge-egs.cc diff --git a/egs/wsj/s5/steps/nnet3/xvector/train.sh b/egs/wsj/s5/steps/nnet3/xvector/train.sh index c57d66f7019..a05c62c5124 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/train.sh +++ b/egs/wsj/s5/steps/nnet3/xvector/train.sh @@ -134,10 +134,10 @@ while [ $x -lt $num_iters ]; do # Use the egs dir from the previous iteration for the diagnostics $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ nnet3-xvector-compute-prob $dir/$x.raw \ - "ark:nnet3-xvector-merge-egs ark:$egs_dir/valid_diagnostic_egs.JOB.ark ark:- |" & + "ark:nnet3-merge-egs --measure-output-frames=false ark:$egs_dir/valid_diagnostic_egs.JOB.ark ark:- |" & $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.JOB.log \ nnet3-xvector-compute-prob $dir/$x.raw \ - "ark:nnet3-xvector-merge-egs ark:$egs_dir/train_diagnostic_egs.JOB.ark ark:- |" & + "ark:nnet3-merge-egs --measure-output-frames=false ark:$egs_dir/train_diagnostic_egs.JOB.ark ark:- |" & if [ $x -gt 0 ]; then $cmd $dir/log/progress.$x.log \ @@ -176,7 +176,7 @@ while [ $x -lt $num_iters ]; do nnet3-xvector-train $parallel_train_opts --print-interval=10 \ --max-param-change=$max_param_change \ $dir/$x.raw \ - "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-xvector-merge-egs --minibatch-size=$minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --measure-output-frames=false --minibatch-size=$minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc index 8627671f53a..fe528486238 100644 --- a/src/nnet3bin/nnet3-merge-egs.cc +++ b/src/nnet3bin/nnet3-merge-egs.cc @@ -58,7 +58,7 @@ int main(int argc, char *argv[]) { "e.g.\n" "nnet3-merge-egs --minibatch-size=512 ark:1.egs ark:- | nnet3-train-simple ... \n" "See also nnet3-copy-egs\n"; - + bool compress = false; int32 minibatch_size = 512; bool measure_output_frames = true; @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { po.Register("discard-partial-minibatches", &discard_partial_minibatches, "discard any partial minibatches of 'uneven' size that may be " "encountered at the end."); - + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -89,18 +89,19 @@ int main(int argc, char *argv[]) { SequentialNnetExampleReader example_reader(examples_rspecifier); NnetExampleWriter example_writer(examples_wspecifier); - + std::vector examples; examples.reserve(minibatch_size); int32 cur_num_output_frames = 0; - + int64 num_read = 0, num_written = 0; while (!example_reader.Done()) { const NnetExample &cur_eg = example_reader.Value(); examples.resize(examples.size() + 1); examples.back() = cur_eg; - cur_num_output_frames += NumOutputIndexes(cur_eg); + if (measure_output_frames) + cur_num_output_frames += NumOutputIndexes(cur_eg); bool minibatch_ready = (measure_output_frames ? 
cur_num_output_frames >= minibatch_size : diff --git a/src/xvectorbin/Makefile b/src/xvectorbin/Makefile index e0703ab8422..1dc1bee6e0a 100644 --- a/src/xvectorbin/Makefile +++ b/src/xvectorbin/Makefile @@ -7,8 +7,7 @@ LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) BINFILES = nnet3-xvector-get-egs nnet3-xvector-compute-prob \ - nnet3-xvector-show-progress nnet3-xvector-train \ - nnet3-xvector-merge-egs + nnet3-xvector-show-progress nnet3-xvector-train OBJFILES = diff --git a/src/xvectorbin/nnet3-xvector-merge-egs.cc b/src/xvectorbin/nnet3-xvector-merge-egs.cc deleted file mode 100644 index 28dc9d2ee18..00000000000 --- a/src/xvectorbin/nnet3-xvector-merge-egs.cc +++ /dev/null @@ -1,108 +0,0 @@ -// xvectorbin/nnet3-xvector-merge-egs.cc - -// Copyright 2016 David Snyder -// 2012-2015 Johns Hopkins University (author: Daniel Povey) -// 2014 Vimal Manohar - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "nnet3/nnet-example.h" -#include "nnet3/nnet-example-utils.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - using namespace kaldi::nnet3; - typedef kaldi::int32 int32; - - const char *usage = - "This copies nnet examples for xvector training from input to\n" - "output but while doing so it merges many NnetExample objects\n" - "into one, forming a minibatch consisting of single NnetExample.\n" - "Unlike nnet3-merge-egs, this binary does not expect the examples\n" - "to have any output.\n" - "\n" - "Usage: nnet3-xvector-merge-egs [options] " - "\n" - "e.g.\n" - "nnet3-xvector-merge-egs --minibatch-size=512 ark:1.egs ark:- " - "| nnet3-xvector-train ... 
\n" - "See also nnet3-copy-egs and nnet3-merge-egs\n"; - - bool compress = false; - int32 minibatch_size = 512; - bool discard_partial_minibatches = false; - - ParseOptions po(usage); - po.Register("minibatch-size", &minibatch_size, "Target size of " - "minibatches when merging."); - po.Register("compress", &compress, "If true, compress the output examples " - "(not recommended unless you are writing to disk)"); - po.Register("discard-partial-minibatches", &discard_partial_minibatches, - "discard any partial minibatches of 'uneven' size that may be " - "encountered at the end."); - - po.Read(argc, argv); - - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - - std::string examples_rspecifier = po.GetArg(1), - examples_wspecifier = po.GetArg(2); - - SequentialNnetExampleReader example_reader(examples_rspecifier); - NnetExampleWriter example_writer(examples_wspecifier); - - std::vector examples; - examples.reserve(minibatch_size); - - int32 num_read = 0, num_written = 0; - while (!example_reader.Done()) { - const NnetExample &cur_eg = example_reader.Value(); - examples.resize(examples.size() + 1); - examples.back() = cur_eg; - bool minibatch_ready = - static_cast(examples.size()) >= minibatch_size; - - // Do Next() now, so we can test example_reader.Done() below . - example_reader.Next(); - num_read++; - - if (minibatch_ready || (!discard_partial_minibatches && - (example_reader.Done() && !examples.empty()))) { - NnetExample merged_eg; - MergeExamples(examples, compress, &merged_eg); - std::ostringstream ostr; - ostr << "merged-" << num_written; - num_written++; - std::string output_key = ostr.str(); - example_writer.Write(output_key, merged_eg); - examples.clear(); - } - } - KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; - return (num_written != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - From 0e744ddadf64a064e33e90bf1741bffeb3fa7925 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Mon, 22 Feb 2016 14:17:01 -0500 Subject: [PATCH 31/32] xvector: fix to nnet3-xvector-get-egs.cc and fix to nnet3-xvector-get-egs.cc (still not working though) and removing trailing spaces --- src/xvector/nnet-xvector-training.cc | 26 ++++++++++++++----------- src/xvectorbin/nnet3-xvector-get-egs.cc | 4 ++-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/xvector/nnet-xvector-training.cc b/src/xvector/nnet-xvector-training.cc index 4ef12e5aaca..1dc8e056fd7 100644 --- a/src/xvector/nnet-xvector-training.cc +++ b/src/xvector/nnet-xvector-training.cc @@ -51,7 +51,7 @@ NnetXvectorTrainer::NnetXvectorTrainer(const NnetTrainerOptions &config, KALDI_WARN << "Could not open cached computation. " "Probably this is the first training iteration."; } - } + } } @@ -96,7 +96,7 @@ void NnetXvectorTrainer::Train(const NnetExample &eg) { if (config_.write_cache != "") { Output ko(config_.write_cache, config_.binary_write_cache); compiler_.WriteCache(ko.Stream(), config_.binary_write_cache); - } + } } void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { @@ -104,12 +104,12 @@ void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { if (nnet_->IsOutputNode(node_index)) { BaseFloat tot_weight, tot_objf; bool supply_deriv = true; - // For each xvector output node, we expect two output nodes with name "s" + // For each xvector output node, we expect two output nodes with name "s" // and "b", which store symmetric affine transformation and bias term // for xvector-objective computation. 
std::string xvector_name = nnet_->GetNodeName(node_index), s_name = "s", b_name = "b"; - if (nnet_->GetNodeIndex(s_name) == -1 || nnet_->GetNodeIndex(b_name) == -1) + if (nnet_->GetNodeIndex(s_name) == -1 || nnet_->GetNodeIndex(b_name) == -1) KALDI_ERR << "The nnet expected to have two output nodes with name s and b."; if (xvector_name != s_name && xvector_name != b_name) { @@ -119,11 +119,11 @@ void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { CuMatrix xvector_deriv(xvector_pairs.NumRows(), xvector_pairs.NumCols(), kUndefined); int32 s_dim = xvector_pairs.NumCols() * (xvector_pairs.NumCols() + 1) / 2; - - // convert CuVector to CuSpMatrix + + // convert CuVector to CuSpMatrix CuSpMatrix xvec_s_sp(s_dim); xvec_s_sp.CopyFromVec(xvec_s.Row(0)); - + CuVector deriv_s(s_dim); BaseFloat xvec_b_val = xvec_b(0,0), deriv_b; ComputeXvectorObjfAndDeriv(xvector_pairs, xvec_s_sp, xvec_b_val, @@ -142,7 +142,7 @@ void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { computer->AcceptOutputDeriv(s_name, &deriv_s_mat); computer->AcceptOutputDeriv(b_name, &deriv_b_mat); } - + objf_info_[xvector_name].UpdateStats(xvector_name, config_.print_interval, num_minibatches_processed_++, tot_weight, tot_objf); @@ -221,7 +221,7 @@ bool ObjectiveFunctionInfo::PrintTotalStats(const std::string &name) const { << (tot_objf / tot_weight) << " over " << tot_weight << " frames."; } else { KALDI_LOG << "Overall average objective function for '" << name << "' is " - << objf << " + " << aux_objf << " = " << sum_objf + << objf << " + " << aux_objf << " = " << sum_objf << " over " << tot_weight << " frames."; } KALDI_LOG << "[this line is to be parsed by a script:] " @@ -245,33 +245,36 @@ void GetComputationRequestXvector(const Nnet &nnet, request->outputs.reserve(eg.io.size()); request->need_model_derivative = need_model_derivative; request->store_component_stats = store_component_stats; + // xvector-egs have multiple inputs(e.g. different inputs correspond // to different chunks and no outputs. for (size_t i = 0; i < eg.io.size(); i++) { const NnetIo &io = eg.io[i]; const std::string &name = io.name; int32 node_index = nnet.GetNodeIndex(name); + if (node_index == -1 && !nnet.IsInputNode(node_index)) KALDI_ERR << "xvector example has input named '" << name << "', but no such input node is in the network."; std::vector &dest = request->inputs; - // nnet.IsInputNode(node_index) ? request->inputs : request->outputs; dest.resize(dest.size() + 1); IoSpecification &io_spec = dest.back(); io_spec.name = name; io_spec.indexes = io.indexes; io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative; } + // We only need the output on frame t=0 for each n. int32 io_index_size = request->inputs[0].indexes.size(); std::vector output_indexes; output_indexes.resize(io_index_size); - for (int32 ind = 0; io_index_size; ind++) { + for (int32 ind = 0; ind < io_index_size; ind++) { output_indexes[ind].n = ind; output_indexes[ind].t = 0; } + // In order to generate computation request for output nodes, // we should find output nodes and add io_spec for each one. int32 num_nodes = nnet.NumNodes(); @@ -285,6 +288,7 @@ void GetComputationRequestXvector(const Nnet &nnet, io_spec.has_deriv = need_model_derivative; } } + // check to see if something went wrong. 
if (request->inputs.empty()) KALDI_ERR << "No inputs in computation request."; diff --git a/src/xvectorbin/nnet3-xvector-get-egs.cc b/src/xvectorbin/nnet3-xvector-get-egs.cc index fd1dc2e943b..8c9610f9429 100644 --- a/src/xvectorbin/nnet3-xvector-get-egs.cc +++ b/src/xvectorbin/nnet3-xvector-get-egs.cc @@ -110,8 +110,8 @@ static void WriteExamples(const MatrixBase &feats, pair->num_frames1, 0, feat_dim), chunk2(feats, pair->start_frame2 + shift2, pair->num_frames2, 0, feat_dim); - NnetIo nnet_io1 = NnetIo("input1", 0, chunk1), - nnet_io2 = NnetIo("input2", 0, chunk2); + NnetIo nnet_io1 = NnetIo("input", 0, chunk1), + nnet_io2 = NnetIo("input", 0, chunk2); for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); indx_it != nnet_io1.indexes.end(); ++indx_it) indx_it->n = 0; From fbfc27bfb4929f59b0d294c900024671bfca1060 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Mon, 22 Feb 2016 16:18:02 -0500 Subject: [PATCH 32/32] xvector: fix in nnet-xvector-training.cc --- src/xvector/nnet-xvector-diagnostics.cc | 12 ++++++------ src/xvector/nnet-xvector-training.cc | 15 ++++++++++----- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/xvector/nnet-xvector-diagnostics.cc b/src/xvector/nnet-xvector-diagnostics.cc index 03a018dc66d..820096118c1 100644 --- a/src/xvector/nnet-xvector-diagnostics.cc +++ b/src/xvector/nnet-xvector-diagnostics.cc @@ -76,11 +76,11 @@ void NnetXvectorComputeProb::Compute(const NnetExample &eg) { } void NnetXvectorComputeProb::ProcessOutputs(NnetComputer *computer) { - for (int32 node_index = 0; node_index < nnet_.NumNodes(); node_index++) { - if (nnet_.IsOutputNode(node_index)) { + for (int32 node_index = 0; node_index < nnet_.NumNodes(); node_index++) { + if (nnet_.IsOutputNode(node_index)) { std::string xvector_name = nnet_.GetNodeName(node_index), s_name = "s", b_name = "b"; - if (nnet_.GetNodeIndex(s_name) == -1 || nnet_.GetNodeIndex(b_name) == -1) + if (nnet_.GetNodeIndex(s_name) == -1 || nnet_.GetNodeIndex(b_name) == -1) KALDI_ERR << "The nnet expected to have two output nodes with name s and b."; if (xvector_name != s_name && xvector_name != b_name) { @@ -90,11 +90,11 @@ void NnetXvectorComputeProb::ProcessOutputs(NnetComputer *computer) { CuMatrix xvector_deriv(xvector_pairs.NumRows(), xvector_pairs.NumCols(), kUndefined); int32 s_dim = xvector_pairs.NumCols() * (xvector_pairs.NumCols() + 1) / 2; - - // convert CuVector to CuSpMatrix + + // convert CuVector to CuSpMatrix CuSpMatrix xvec_s_sp(s_dim); xvec_s_sp.CopyFromVec(xvec_s.Row(0)); - + CuVector deriv_s(s_dim); BaseFloat xvec_b_val = xvec_b(0,0), deriv_b; BaseFloat tot_weight, tot_objf; diff --git a/src/xvector/nnet-xvector-training.cc b/src/xvector/nnet-xvector-training.cc index 1dc8e056fd7..9abd62937f3 100644 --- a/src/xvector/nnet-xvector-training.cc +++ b/src/xvector/nnet-xvector-training.cc @@ -267,12 +267,17 @@ void GetComputationRequestXvector(const Nnet &nnet, } // We only need the output on frame t=0 for each n. 
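// (Each chunk appears in the inputs as a distinct 'n' value with several 't'
// values, so the number of input indexes whose t is zero equals the number of
// chunks, i.e. the number of output indexes required.)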
- int32 io_index_size = request->inputs[0].indexes.size(); + int32 io_index_size = request->inputs[0].indexes.size(), + n_indx_size = 0; std::vector output_indexes; - output_indexes.resize(io_index_size); - for (int32 ind = 0; ind < io_index_size; ind++) { - output_indexes[ind].n = ind; - output_indexes[ind].t = 0; + for (int32 indx = 0; indx < io_index_size; indx++) + if (request->inputs[0].indexes[indx].t == 0) + n_indx_size++; + + output_indexes.resize(n_indx_size); + for (int32 indx = 0; indx < n_indx_size; indx++) { + output_indexes[indx].n = indx; + output_indexes[indx].t = 0; } // In order to generate computation request for output nodes,