From a9b65137b4ab90845c1357724d5ddaa805972830 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 9 Feb 2016 19:30:27 -0500 Subject: [PATCH 01/32] fix to validate_lang.pl regarding disambiguation symbols, and associated changes to how language-model disambiguation symbols are handled to make it easier to add more of them. --- egs/wsj/s5/utils/format_lm_sri.sh | 12 +-- egs/wsj/s5/utils/lang/add_lex_disambig.pl | 1 + egs/wsj/s5/utils/lang/check_g_properties.pl | 89 ++++++++++++++++ egs/wsj/s5/utils/lang/prepare_lang.sh | 1 + egs/wsj/s5/utils/lang/validate_lang.pl | 1 + egs/wsj/s5/utils/prepare_lang.sh | 63 +++++++----- egs/wsj/s5/utils/validate_lang.pl | 108 +++++++++++++++----- src/fstbin/fstaddselfloops.cc | 11 +- tools/extras/install_irstlm.sh | 2 +- 9 files changed, 225 insertions(+), 63 deletions(-) create mode 120000 egs/wsj/s5/utils/lang/add_lex_disambig.pl create mode 100755 egs/wsj/s5/utils/lang/check_g_properties.pl create mode 120000 egs/wsj/s5/utils/lang/prepare_lang.sh create mode 120000 egs/wsj/s5/utils/lang/validate_lang.pl diff --git a/egs/wsj/s5/utils/format_lm_sri.sh b/egs/wsj/s5/utils/format_lm_sri.sh index 7753c186045..7b5477e958a 100755 --- a/egs/wsj/s5/utils/format_lm_sri.sh +++ b/egs/wsj/s5/utils/format_lm_sri.sh @@ -71,8 +71,8 @@ if [ -z $loc ]; then export PATH=$PATH:$sdir:$sdir/.. else echo You appear to not have SRILM tools installed, either on your path, - echo or installed in $sdir. See tools/install_srilm.sh for installation - echo instructions. + echo or installed in $sdir. cd to ../../../tools and run + echo extras/install_srilm.sh. exit 1 fi fi @@ -88,8 +88,8 @@ lm_base=$(basename $lm '.gz') gunzip -c $lm | utils/find_arpa_oovs.pl $out_dir/words.txt \ > $out_dir/oovs_${lm_base}.txt || exit 1; -# Removing all "illegal" combinations of and , which are supposed to -# occur only at being/end of utt. These can cause determinization failures +# Removing all "illegal" combinations of and , which are supposed to +# occur only at being/end of utt. These can cause determinization failures # of CLG [ends up being epsilon cycles]. gunzip -c $lm \ | egrep -v ' | | ' \ @@ -98,8 +98,8 @@ gunzip -c $lm \ awk '{print $1}' $out_dir/words.txt > $tmpdir/voc || exit 1; # Change the LM vocabulary to be the intersection of the current LM vocabulary -# and the set of words in the pronunciation lexicon. This also renormalizes the -# LM by recomputing the backoff weights, and remove those ngrams whose +# and the set of words in the pronunciation lexicon. This also renormalizes the +# LM by recomputing the backoff weights, and remove those ngrams whose # probabilities are lower than the backed-off estimates. 
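+# As a rough worked example (the words here are made up): if the ARPA LM's
+# vocabulary is {the, cat, sat, mat} but the lexicon only has {the, cat, sat},
+# the change-lm-vocab call below keeps only the n-grams over {the, cat, sat},
+# recomputes the backoff weights, and drops any n-gram whose probability has
+# fallen below its backed-off estimate.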
change-lm-vocab -vocab $tmpdir/voc -lm $tmpdir/lm.gz -write-lm $tmpdir/out_lm \ $srilm_opts || exit 1; diff --git a/egs/wsj/s5/utils/lang/add_lex_disambig.pl b/egs/wsj/s5/utils/lang/add_lex_disambig.pl new file mode 120000 index 00000000000..2d1d4425b49 --- /dev/null +++ b/egs/wsj/s5/utils/lang/add_lex_disambig.pl @@ -0,0 +1 @@ +../add_lex_disambig.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/check_g_properties.pl b/egs/wsj/s5/utils/lang/check_g_properties.pl new file mode 100755 index 00000000000..aa0e6eb1c78 --- /dev/null +++ b/egs/wsj/s5/utils/lang/check_g_properties.pl @@ -0,0 +1,89 @@ +#!/usr/bin/env perl + +use IPC::Open2; + +if (@ARGV != 1) { + print "Usage: $0 [options] \n"; + print "e.g.: $0 data/lang\n"; + exit(1); +} + +$lang = shift @ARGV; + +# This script checks that G.fst in the lang.fst directory is OK with respect +# to certain expected properties, and returns nonzero exit status if a problem was +# detected. It is called from validate_lang.pl. +# This only checks the properties of G that relate to disambiguation symbols, +# epsilons and forbidden symbols and . + +if (! -e "$lang/G.fst") { + print "$0: error: $lang/G.fst does not exist\n"; + exit(1); +} + +open(W, "<$lang/words.txt") || die "opening $lang/words.txt"; +$hash_zero = -1; +while () { + @A = split(" ", $_); + ($sym, $int) = @A; + if ($sym eq "" || $sym eq "") { $is_forbidden{$int} = 1; } + if ($sym eq "#0") { $hash_zero = $int; } +} + +if (-e "$lang/phones/wdisambig_words.int") { + open(F, "<$lang/phones/wdisambig_words.int") || die "opening $lang/phones/wdisambig_words.int"; + while () { + chop; + $is_disambig{$_} = 1; + } +} else { + $is_disambig{$hash_zero} = 1; +} + +$input_cmd = ". ./path.sh; fstprint $lang/G.fst|"; +open(G, $input_cmd) || die "running command $input_cmd"; + +$info_cmd = ". ./path.sh; fstcompile | fstinfo "; +open2(O, I, "$info_cmd") || die "running command $info_cmd"; + +$has_epsilons = 0; + +while () { + @A = split(" ", $_); + if (@A >= 4) { + if ($is_forbidden{$A[2]} || $is_forbidden{$A[3]}) { + chop; + print "$0: validating $lang: error: line $_ in G.fst contains forbidden symbol or \n"; + exit(1); + } elsif ($is_disambig{$A[2]}) { + print O $_; + if ($A[3] != 0) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has disambig on input but no epsilon on output\n"; + exit(1); + } + } elsif ($A[2] == 0) { + print O $_; + $has_epsilons = 1; + } elsif ($A[2] != $A[3]) { + chop; + print "$0: validating $lang: error: line $_ in G.fst has inputs and outputs different but input is not disambig symbol.\n"; + exit(1); + } + } +} + +close(O); # tell 'fstcompile | fstinfo' pipeline that its input is done. +while () { + if (m/cyclic\s+/) { + print "$0: validating $lang: error: G.fst has cycles containing only disambig symbols and epsilons. Would cause determinization failure\n"; + exit(1); + } +} + +if ($has_epsilons) { + print "$0: warning: validating $lang: G.fst has epsilon-input arcs. 
We don't expect these in most setups.\n"; +} + +print "--> $0 successfully validated $lang/G.fst\n"; +exit(0); diff --git a/egs/wsj/s5/utils/lang/prepare_lang.sh b/egs/wsj/s5/utils/lang/prepare_lang.sh new file mode 120000 index 00000000000..96b9f592e82 --- /dev/null +++ b/egs/wsj/s5/utils/lang/prepare_lang.sh @@ -0,0 +1 @@ +../prepare_lang.sh \ No newline at end of file diff --git a/egs/wsj/s5/utils/lang/validate_lang.pl b/egs/wsj/s5/utils/lang/validate_lang.pl new file mode 120000 index 00000000000..edb66bf3149 --- /dev/null +++ b/egs/wsj/s5/utils/lang/validate_lang.pl @@ -0,0 +1 @@ +../validate_lang.pl \ No newline at end of file diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 43b8bce1f4c..c8888dbcb8a 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -28,20 +28,21 @@ # and also files silence_phones.txt, nonsilence_phones.txt, optional_silence.txt # and extra_questions.txt # Here, silence_phones.txt and nonsilence_phones.txt are lists of silence and -# non-silence phones respectively (where silence includes various kinds of -# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the +# non-silence phones respectively (where silence includes various kinds of +# noise, laugh, cough, filled pauses etc., and nonsilence phones includes the # "real" phones.) -# In each line of those files is a list of phones, and the phones on each line -# are assumed to correspond to the same "base phone", i.e. they will be +# In each line of those files is a list of phones, and the phones on each line +# are assumed to correspond to the same "base phone", i.e. they will be # different stress or tone variations of the same basic phone. -# The file "optional_silence.txt" contains just a single phone (typically SIL) +# The file "optional_silence.txt" contains just a single phone (typically SIL) # which is used for optional silence in the lexicon. # extra_questions.txt might be empty; typically will consist of lists of phones, -# all members of each list with the same stress or tone; and also possibly a -# list for the silence phones. This will augment the automtically generated -# questions (note: the automatically generated ones will treat all the -# stress/tone versions of a phone the same, so will not "get to ask" about +# all members of each list with the same stress or tone; and also possibly a +# list for the silence phones. This will augment the automatically generated +# questions (note: the automatically generated ones will treat all the +# stress/tone versions of a phone the same, so will not "get to ask" about # stress or tone). +# # This script adds word-position-dependent phones and constructs a host of other # derived files, that go in data/lang/. @@ -49,19 +50,20 @@ # Begin configuration section. num_sil_states=5 num_nonsil_states=3 +num_word_disambig_syms=1 position_dependent_phones=true -# position_dependent_phones is false also when position dependent phones and word_boundary.txt +# position_dependent_phones is false also when position dependent phones and word_boundary.txt # have been generated by another source reverse=false -share_silence_phones=false # if true, then share pdfs of different silence +share_silence_phones=false # if true, then share pdfs of different silence # phones together. sil_prob=0.5 phone_symbol_table= # if set, use a specified phones.txt file. # end configuration sections -. utils/parse_options.sh +. 
utils/parse_options.sh -if [ $# -ne 4 ]; then +if [ $# -ne 4 ]; then echo "usage: utils/prepare_lang.sh " echo "e.g.: utils/prepare_lang.sh data/local/dict data/local/lang data/lang" echo " should contain the following files:" @@ -133,10 +135,10 @@ if $position_dependent_phones; then # adding the markers _B, _E, _S, _I depending on word position. # In this recipe, these markers apply to silence also. # Do this starting from lexiconp.txt only. - if "$silprob"; then + if "$silprob"; then perl -ane '@A=split(" ",$_); $w = shift @A; $p = shift @A; $silword_p = shift @A; $wordsil_f = shift @A; $wordnonsil_f = shift @A; @A>0||die; - if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } + if(@A==1) { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_S\n"; } else { print "$w $p $silword_p $wordsil_f $wordnonsil_f $A[0]_B "; for($n=1;$n<@A-1;$n++) { print "$A[$n]_I "; } print "$A[$n]_E\n"; } ' \ < $srcdir/lexiconp_silprob.txt > $tmpdir/lexiconp_silprob.txt @@ -158,11 +160,11 @@ if $position_dependent_phones; then mv $tmpdir/lexiconp.pre_reverse $tmpdir/lexiconp.txt fi fi - + # create $tmpdir/phone_map.txt # this has the format (on each line) # ... - # where the versions depend on the position of the phone within a word. + # where the versions depend on the position of the phone within a word. # For instance, we'd have: # AA AA_B AA_E AA_I AA_S # for (B)egin, (E)nd, (I)nternal and (S)ingleton @@ -178,7 +180,7 @@ if $position_dependent_phones; then <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ > $tmpdir/phone_map.txt else - if "$silprob"; then + if "$silprob"; then cp $srcdir/lexiconp_silprob.txt $tmpdir/lexiconp_silprob.txt if $reverse; then echo "We do not support reverse option and silprob at the same time" @@ -277,7 +279,7 @@ if [[ ! -z $phone_symbol_table ]]; then start_symbol=`grep \#0 $phone_symbol_table | awk '{print $2}'` echo "" | cat - $dir/phones/{silence,nonsilence}.txt | awk -v f=$phone_symbol_table ' BEGIN { while ((getline < f) > 0) { phones[$1] = $2; }} { print $1" "phones[$1]; }' | sort -k2 -g |\ - cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt + cat - <(cat $dir/phones/disambig.txt | awk -v x=$start_symbol '{n=x+NR-1; print $1, n;}') > $dir/phones.txt else echo "" | cat - $dir/phones/{silence,nonsilence,disambig}.txt | \ awk '{n=NR-1; print $1, n;}' > $dir/phones.txt @@ -313,7 +315,7 @@ fi cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | uniq | awk ' BEGIN { print " 0"; - } + } { if ($1 == "") { print " is in the vocabulary!" | "cat 1>&2" @@ -362,7 +364,7 @@ cat $dir/phones/align_lexicon.txt | utils/sym2int.pl -f 3- $dir/phones.txt | \ utils/sym2int.pl -f 1-2 $dir/words.txt > $dir/phones/align_lexicon.int # Create the basic L.fst without disambiguation symbols, for use -# in training. +# in training. if $silprob; then # Usually it's the same as having a fixed-prob L.fst @@ -386,7 +388,18 @@ cat $dir/oov.txt | utils/sym2int.pl $dir/words.txt >$dir/oov.int || exit 1; # integer version of oov symbol, used in some scripts. -# Create these lists of phones in colon-separated integer list form too, +# the file wdisambig.txt contains a (line-by-line) list of the text-form of the +# disambiguation symbols that are used in the grammar and passed through by the +# lexicon. At this stage it's hardcoded as '#0', but we're laying the groundwork +# for more generality (which probably would be added by another script). 
+# wdisambig_words.int contains the corresponding list interpreted by the +# symbol table words.txt, and wdisambig_phones.int contains the corresponding +# list interpreted by the symbol table phones.txt. +echo '#0' >$dir/phones/wdisambig.txt +utils/sym2int.pl $dir/phones.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_phones.int +utils/sym2int.pl $dir/words.txt <$dir/phones/wdisambig.txt >$dir/phones/wdisambig_words.int + +# Create these lists of phones in colon-separated integer list form too, # for purposes of being given to programs as command-line options. for f in silence nonsilence optional_silence disambig context_indep; do utils/sym2int.pl $dir/phones.txt <$dir/phones/$f.txt >$dir/phones/$f.int @@ -415,20 +428,18 @@ utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonel # Create the lexicon FST with disambiguation symbols, and put it in lang_test. # There is an extra step where we create a loop to "pass through" the # disambiguation symbols from G.fst. -phone_disambig_symbol=`grep \#0 $dir/phones.txt | awk '{print $2}'` -word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'` if $silprob; then utils/make_lexicon_fst_silprob.pl $tmpdir/lexiconp_silprob_disambig.txt $srcdir/silprob.txt $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; else utils/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt $sil_prob $silphone '#'$ndisambig | \ fstcompile --isymbols=$dir/phones.txt --osymbols=$dir/words.txt \ --keep_isymbols=false --keep_osymbols=false | \ - fstaddselfloops "echo $phone_disambig_symbol |" "echo $word_disambig_symbol |" | \ + fstaddselfloops $dir/phones/wdisambig_phones.int $dir/phones/wdisambig_words.int | \ fstarcsort --sort_type=olabel > $dir/L_disambig.fst || exit 1; fi diff --git a/egs/wsj/s5/utils/validate_lang.pl b/egs/wsj/s5/utils/validate_lang.pl index ae087bd9578..415d06a4aaf 100755 --- a/egs/wsj/s5/utils/validate_lang.pl +++ b/egs/wsj/s5/utils/validate_lang.pl @@ -89,15 +89,7 @@ $wint2sym{$wsymtab{$_}} = $_; } } -if (exists $wsymtab{"#0"}) { - print "--> $lang/words.txt has \"#0\"\n"; - print "--> $lang/words.txt is OK\n"; -} else { - $warning = 1; - print "--> WARNING: $lang/words.txt doesn't have \"#0\"\n"; - print "--> (if you are using ARPA-type language models, you will normally\n"; - print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; -} +print "--> $lang/words.txt is OK\n"; print "\n"; # Checking phones/* ------------------------------- @@ -113,7 +105,6 @@ sub check_txt_int_csl { if (!open(CSL, "<$cat.csl")) { $exit = 1; return print "--> ERROR: fail to open $cat.csl\n"; } - if (-z "$cat.txt") { $warning = 1; print "--> WARNING: $cat.txt is empty\n"; } @@ -743,6 +734,77 @@ sub check_summation { } } +sub check_wdisambig { + print "Checking word-level disambiguation symbols...\n"; + # This block checks that one of the two following conditions hold: + # (1) for lang diretories prepared by older versions of prepare_lang.sh: + # The symbol '#0' should appear in words.txt and phones.txt, and should + # or (2): the files wdisambig.txt, wdisambig_phones.int and wdisambig_words.int + # exist, and have the expected properties (see below for details). 
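+  # As an illustration (the integer ids below are made up), a lang directory
+  # with the single word-level disambiguation symbol '#0' would contain:
+  #   phones/wdisambig.txt:          #0
+  #   phones/wdisambig_words.int:    200004   (the id of '#0' in words.txt)
+  #   phones/wdisambig_phones.int:   352      (the id of '#0' in phones.txt)
+  # All three files have one entry per line and must be the same length.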
+ my %wdisambig_words_hash; + my %wdisambig_words_string = ""; + + if (! -e "$lang/phones/wdisambig.txt") { + print "--> no $lang/phones/wdisambig.txt (older prepare_lang.sh)\n"; + if (exists $wsymtab{"#0"}) { + print "--> $lang/words.txt has \"#0\"\n"; + $wdisambig_words_hash{$wsymtab{"#0"}} = 1; + $wdisambig_words_string = $wsymtab{"#0"}; + } else { + print "--> ERROR: $lang/words.txt doesn't have \"#0\"\n"; + print "--> (if you are using ARPA-type language models, you will normally\n"; + print "--> need the disambiguation symbol \"#0\" to ensure determinizability)\n"; + $exit = 1; + } + } else { + print "--> $lang/phones/wdisambig.txt exists (newer prepare_lang.sh)\n"; + if (!open(T, "<$lang/phones/wdisambig.txt")) { + print "--> ERROR: fail to open $lang/phones/wdisambig.txt\n"; $exit = 1; return; + } + chomp(my @wdisambig = ); + close(T); + if (!open(W, "<$lang/phones/wdisambig_words.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_words.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_words = ); + close(W); + if (!open(P, "<$lang/phones/wdisambig_phones.int")) { + print "--> ERROR: fail to open $lang/phones/wdisambig_phones.int\n"; $exit = 1; return; + } + chomp(my @wdisambig_phones =
<P>
); + close(P); + my $len = @wdisambig, $len2; + if (($len2 = @wdisambig_words) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int have different lengths"; + $exit = 1; return; + } + if (($len2 = @wdisambig_phones) != $len) { + print "--> ERROR: files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int have different lengths"; + $exit = 1; return; + } + for (my $i = 0; $i < $len; $i++) { + if ($wsymtab{$wdisambig[$i]} ne $wdisambig_words[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_words.int mismatch\n"; + $exit = 1; return; + } + } + for (my $i = 0; $i < $len; $i++) { + if ($psymtab{$wdisambig[$i]} ne $wdisambig_phones[$i]) { + my $ii = $i + 1; + print "--> ERROR: line $ii of files $lang/phones/wdisambig.txt and $lang/phones/wdisambig_phones.int mismatch\n"; + $exit = 1; return; + } + } + foreach my $i ( @wdisambig_words ) { + $wdisambig_words_hash{$i} = 1; + $wdisambig_words_string .= " " . $i; + } + } +} + +check_wdisambig(); + if (-e "$lang/G.fst") { # Check that G.fst is ilabel sorted and nonempty. $text = `. ./path.sh; fstinfo $lang/G.fst`; @@ -781,21 +843,17 @@ sub check_summation { } # Check that G.fst does not have cycles with only disambiguation symbols or - # epsilons on the input, or the forbidden symbols and . - $cmd = ". ./path.sh; fstprint $lang/G.fst | awk -v disambig=$lang/phones/disambig.int -v words=$lang/words.txt 'BEGIN{while((getline0) is_disambig[\$1]=1; is_disambig[0] = 1; while((getline0){ if(\$1==\"\"||\$1==\"\") is_forbidden[\$2]=1;}} {if(NF<3 || is_disambig[\$3]) print; else if(is_forbidden[\$3] || is_forbidden[\$4]) { print \"Error: line \" \$0 \" in G.fst contains forbidden symbol or \" | \"cat 1>&2\"; exit(1); }}' | fstcompile | fstinfo "; - $output = `$cmd`; - if ($output !~ m/# of states\s+[1-9]/) { # fstinfo did not read a nonempty FST (there should be final probs at least)... - print "--> ERROR: failure running command to check for disambig-sym loops [possibly G.fst " . - "contained the forbidden symbols or , or possibly some other error.. Output was: \n"; - print $output; - $exit = 1; - } - if ($output !~ m/cyclic\s+n/) { # FST was cyclic after selecting only for disambig symbols. This is now allowed. - print "--> ERROR: G.fst contained cycles with only disambiguation symbols or epsilons on the input. Would cause determinization failure in graph creation.\n"; - $exit = 1; - } else { - print "--> G.fst did not contain cycles with only disambig symbols or epsilon on the input, and did not contain\n" . - "the forbidden symbols or (if present in vocab) on the input or output.\n"; + # epsilons on the input, or the forbidden symbols and (and a few + # related checks + + if (-e "$lang/G.fst") { + system("utils/lang/check_g_properties.pl $lang"); + if ($? != 0) { + print "--> ERROR: failure running check_g_properties.pl\n"; + $exit = 1; + } else { + print("--> utils/lang/check_g_properties.pl succeeded.\n"); + } } } diff --git a/src/fstbin/fstaddselfloops.cc b/src/fstbin/fstaddselfloops.cc index 9219093bee1..96895f23cf4 100644 --- a/src/fstbin/fstaddselfloops.cc +++ b/src/fstbin/fstaddselfloops.cc @@ -45,8 +45,9 @@ int main(int argc, char *argv[]) { "on at least one arc out of the state. 
Useful in conjunction with predeterminize\n" "\n" "Usage: fstaddselfloops in-disambig-list out-disambig-list [in.fst [out.fst] ]\n" - "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n"; - + "E.g: fstaddselfloops in.list out.list < in.fst > withloops.fst\n" + "in.list and out.list are lists of integers, one per line, of the\n" + "same length.\n"; ParseOptions po(usage); po.Read(argc, argv); @@ -62,12 +63,12 @@ int main(int argc, char *argv[]) { fst_out_filename = po.GetOptArg(4); VectorFst *fst = ReadFstKaldi(fst_in_filename); - + std::vector disambig_in; if (!ReadIntegerVectorSimple(disambig_in_rxfilename, &disambig_in)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " << kaldi::PrintableRxfilename(disambig_in_rxfilename); - + std::vector disambig_out; if (!ReadIntegerVectorSimple(disambig_out_rxfilename, &disambig_out)) KALDI_ERR << "fstaddselfloops: Could not read disambiguation symbols from " @@ -81,7 +82,7 @@ int main(int argc, char *argv[]) { WriteFstKaldi(*fst, fst_out_filename); delete fst; - + return 0; } catch(const std::exception &e) { std::cerr << e.what(); diff --git a/tools/extras/install_irstlm.sh b/tools/extras/install_irstlm.sh index 680e37a60b4..53ac392ac0e 100755 --- a/tools/extras/install_irstlm.sh +++ b/tools/extras/install_irstlm.sh @@ -55,7 +55,7 @@ fi [ ! -z ${IRSTLM} ] && \ echo >&2 "IRSTLM config is already in env.sh" && exit - wd=`readlink -f $wd || pwd` + wd=`readlink -f $wd 2>/dev/null || pwd` echo "export IRSTLM=$wd/irstlm" echo "export PATH=\${PATH}:\${IRSTLM}/bin" From 117c075076ff9c8b0589913d8cc8770901313ba9 Mon Sep 17 00:00:00 2001 From: vesis84 Date: Thu, 11 Feb 2016 17:39:46 +0100 Subject: [PATCH 02/32] nnet1: added the removal of frames with 0 confidence from mini-batches --- src/nnetbin/nnet-train-frmshuff.cc | 33 ++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/src/nnetbin/nnet-train-frmshuff.cc b/src/nnetbin/nnet-train-frmshuff.cc index 8cc065add4f..1d804f971c0 100644 --- a/src/nnetbin/nnet-train-frmshuff.cc +++ b/src/nnetbin/nnet-train-frmshuff.cc @@ -226,6 +226,39 @@ int main(int argc, char *argv[]) { // apply optional feature transform nnet_transf.Feedforward(CuMatrix(mat), &feats_transf); + // remove frames with '0' weight from training, + { + // are there frames to be removed? 
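+        // Note: '!weights.Min() > 0.0' parses as '(!weights.Min()) > 0.0', so the
+        // branch below is taken exactly when the smallest per-frame weight is 0.0
+        // (assuming the per-frame confidences are non-negative).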
+ if (!weights.Min() > 0.0) { + // create vector with frame-indices to keep, + std::vector keep_frames; + for (int32 i=0; i 0.0) + keep_frames.push_back(i); + } + if (keep_frames.size() == 0) continue; // all frames removed, skip sentence, + + // filter feature-frames, + CuMatrix tmp_feats(keep_frames.size(), feats_transf.NumCols()); + tmp_feats.CopyRows(feats_transf, CuArray(keep_frames)); + tmp_feats.Swap(&feats_transf); + + // filter targets, + Posterior tmp_targets; + for (int32 i=0; i tmp_weights(keep_frames.size()); + for (int32 i=0; i Date: Fri, 12 Feb 2016 17:16:24 -0500 Subject: [PATCH 03/32] some cosmetic improvements to slurm.pl and to the fisher_callhome_spanish recipe --- .../s5/local/fsp_data_prep.sh | 40 +++++------ egs/fisher_callhome_spanish/s5/run.sh | 22 +++--- egs/wsj/s5/utils/slurm.pl | 69 ++++++++++--------- 3 files changed, 68 insertions(+), 63 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh index f453ab42058..8fe80b46784 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_data_prep.sh @@ -1,13 +1,13 @@ #!/bin/bash # # Copyright 2014 Gaurav Kumar. Apache 2.0 -# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) -# In addition the transcripts are needed as well. +# The input is the Fisher Dataset which contains DISC1 and DISC2. (*.sph files) +# In addition the transcripts are needed as well. # To be run from one directory above this script. # Note: when creating your own data preparation scripts, it's a good idea # to make sure that the speaker id (if present) is a prefix of the utterance -# id, that the output scp file is sorted on utterance id, and that the +# id, that the output scp file is sorted on utterance id, and that the # transcription file is exactly the same length as the scp file and is also # sorted on utterance id (missing transcriptions should be removed from the # scp file using e.g. scripts/filter_scp.pl) @@ -18,8 +18,8 @@ export LC_ALL=C if [ $# -lt 2 ]; then - echo "Arguments should be the location of the Spanish Fisher Speech and Transcript Directories, se -e ../run.sh for example." 
+ echo "Usage: $0 " + echo "e.g.: $0 /home/mpost/data/LDC/LDC2010S01 /home/mpost/data/LDC/LDC2010T04" exit 1; fi @@ -72,20 +72,20 @@ fi speech_d1=$dir/links/LDC2010S01/DISC1/data/speech speech_d2=$dir/links/LDC2010S01/DISC2/data/speech -transcripts=$dir/links/LDC2010T04/data/transcripts - -fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` -fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` -fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` -#TODO:it seems like not all speech files have transcripts +transcripts=$dir/links/LDC2010T04/data/transcripts + +fcount_d1=`find ${speech_d1} -iname '*.sph' | wc -l` +fcount_d2=`find ${speech_d2} -iname '*.sph' | wc -l` +fcount_t=`find ${transcripts} -iname '*.tdf' | wc -l` +#TODO:it seems like not all speech files have transcripts #Now check if we got all the files that we needed -if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; -then - echo "Incorrect number of files in the data directories" - echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" - echo "The transcripts should contain 819 files" - exit 1; -fi +if [ $fcount_d1 != 411 -o $fcount_d2 != 408 -o $fcount_t != 819 ]; +then + echo "Incorrect number of files in the data directories" + echo "DISC1 and DISC2 should contain 411 and 408 .sph files respectively" + echo "The transcripts should contain 819 files" + exit 1; +fi if [ $stage -le 0 ]; then #Gather all the speech files together to create a file list @@ -105,7 +105,7 @@ if [ $stage -le 1 ]; then mv $tmpdir/reco2file_and_channel $dir/train_all/ fi -if [ $stage -le 2 ]; then +if [ $stage -le 2 ]; then sort $tmpdir/text.1 | grep -v '((' | \ awk '{if (NF > 1){ print; }}' | \ sed 's:<\s*[/]*\s*\s*for[ei][ei]g[nh]\s*\w*>::g' | \ @@ -149,7 +149,7 @@ if [ $stage -le 3 ]; then for f in `cat $tmpdir/train_sph.flist`; do # convert to absolute path readlink -e $f - done > $tmpdir/train_sph_abs.flist + done > $tmpdir/train_sph_abs.flist cat $tmpdir/train_sph_abs.flist | perl -ane 'm:/([^/]+)\.sph$: || die "bad line $_; "; print "$1 $_"; ' > $tmpdir/sph.scp cat $tmpdir/sph.scp | awk -v sph2pipe=$sph2pipe '{printf("%s-A %s -f wav -p -c 1 %s |\n", $1, sph2pipe, $2); printf("%s-B %s -f wav -p -c 2 %s |\n", $1, sph2pipe, $2);}' | \ diff --git a/egs/fisher_callhome_spanish/s5/run.sh b/egs/fisher_callhome_spanish/s5/run.sh index 706f3793278..edd7f56bad2 100755 --- a/egs/fisher_callhome_spanish/s5/run.sh +++ b/egs/fisher_callhome_spanish/s5/run.sh @@ -17,12 +17,10 @@ set -e sfisher_speech=/home/mpost/data/LDC/LDC2010S01 sfisher_transcripts=/home/mpost/data/LDC/LDC2010T04 spanish_lexicon=/export/corpora/LDC/LDC96L16 -#split=/export/a04/gkumar/corpora/fishcall/jack-splits/split-matt split=local/splits/split_fisher callhome_speech=/export/corpora/LDC/LDC96S35 callhome_transcripts=/export/corpora/LDC/LDC96T17 -#split_callhome=/export/a04/gkumar/corpora/fishcall/jack-splits/split-callhome split=local/splits/split_callhome local/fsp_data_prep.sh $sfisher_speech $sfisher_transcripts @@ -33,16 +31,16 @@ local/fsp_prepare_dict.sh $spanish_lexicon # Rewrite ----------------------------- This section is no longer needed---- # At this point, it might make sense to use a bigger lexicon -# The one I will use is derived from this exercise (spanish fisher) and -# the LDC spanish lexicon along with the most frequent words derived from the +# The one I will use is derived from this exercise (spanish fisher) and +# the LDC spanish lexicon along with the most frequent words derived from the # gigaword corpus such that the total number 
of entries in the lexicon # are 64k # To generate the merged lexicon, run # /export/a04/gkumar/corpora/gigaword/bin/merge_lexicons.py # you might have to set the locations of the three lexicons within this -# file. Note that the LDC rule base phoneme generator works only from its -# own directory. So the merged lexicon is actually created in +# file. Note that the LDC rule base phoneme generator works only from its +# own directory. So the merged lexicon is actually created in # /export/a04/gkumar/corpora/LDC9..../spanish_lexicon../lexicon64k # This can be easily fixed and will be done. #TODO # Also run the clean lexicon script to take care of non stressable vowels @@ -57,11 +55,11 @@ utils/prepare_lang.sh data/local/dict "" data/local/lang data/lang # Make sure that you do not use your test and your dev sets to train the LM -# Some form of cross validation is possible where you decode your dev/set based on an +# Some form of cross validation is possible where you decode your dev/set based on an # LM that is trained on everything but that that conversation # When in doubt about what your data partitions should be use local/fsp_ideal_data_partitions.pl -# to get the numbers. Depending on your needs, you might have to change the size of -# the splits within that file. The default paritions are based on the Kaldi + Joshua +# to get the numbers. Depending on your needs, you might have to change the size of +# the splits within that file. The default paritions are based on the Kaldi + Joshua # requirements which means that I have very large dev and test sets local/fsp_train_lms.sh $split local/fsp_create_test_lang.sh @@ -95,7 +93,7 @@ cp -r data/local/data/callhome_train_all data/callhome_train_all # MT Tune : Same as the ASR eval set (Use the lattices from here) # MT Eval : 20k utterances # The dev and the test sets need to be carefully chosen so that there is no conversation/speaker -# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. +# overlap. This has been setup and the script local/fsp_ideal_data_partitions provides the numbers that are needed below. # As noted above, the LM has not been trained on the dev and the test sets. #utils/subset_data_dir.sh --first data/train_all 158126 data/dev_and_test #utils/subset_data_dir.sh --first data/dev_and_test 37814 data/asr_dev_and_test @@ -136,7 +134,7 @@ utils/subset_data_dir.sh --shortest data/train 90000 data/train_100kshort utils/subset_data_dir.sh data/train_100kshort 10000 data/train_10k local/remove_dup_utts.sh 100 data/train_10k data/train_10k_nodup utils/subset_data_dir.sh --speakers data/train 30000 data/train_30k -utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k +utils/subset_data_dir.sh --speakers data/train 90000 data/train_100k steps/train_mono.sh --nj 10 --cmd "$train_cmd" \ data/train_10k_nodup data/lang exp/mono0a @@ -178,7 +176,7 @@ steps/train_lda_mllt.sh --cmd "$train_cmd" \ exp/tri3a/graph data/dev exp/tri3a/decode_dev || exit 1; )& -# Next we'll use fMLLR and train with SAT (i.e. on +# Next we'll use fMLLR and train with SAT (i.e. 
on # fMLLR features) steps/align_fmllr.sh --nj 30 --cmd "$train_cmd" \ diff --git a/egs/wsj/s5/utils/slurm.pl b/egs/wsj/s5/utils/slurm.pl index 68c269080ac..8095272732e 100755 --- a/egs/wsj/s5/utils/slurm.pl +++ b/egs/wsj/s5/utils/slurm.pl @@ -11,7 +11,7 @@ use Cwd; use Getopt::Long; -# slurm.pl was created from the queue.pl +# slurm.pl was created from the queue.pl # queue.pl has the same functionality as run.pl, except that # it runs the job in question on the queue (Sun GridEngine). # This version of queue.pl uses the task array functionality @@ -20,7 +20,7 @@ # The script now supports configuring the queue system using a config file # (default in conf/queue.conf; but can be passed specified with --config option) -# and a set of command line options. +# and a set of command line options. # The current script handles: # 1) Normal configuration arguments # For e.g. a command line option of "--gpu 1" could be converted into the option @@ -30,7 +30,7 @@ # $0 here in the line is replaced with the argument read from the CLI and the # resulting string is passed to qsub. # 2) Special arguments to options such as -# gpu=0 +# gpu=0 # If --gpu 0 is given in the command line, then no special "-q" is given. # 3) Default argument # default gpu=0 @@ -60,7 +60,7 @@ my $qsub_opts = ""; my $sync = 0; my $num_threads = 1; -my $max_jobs_run; +my $max_jobs_run; my $gpu = 0; my $config = "conf/slurm.conf"; @@ -99,12 +99,12 @@ () print_usage(); } -for (my $x = 1; $x <= 3; $x++) { # This for-loop is to +for (my $x = 1; $x <= 3; $x++) { # This for-loop is to # allow the JOB=1:n option to be interleaved with the # options to qsub. while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) { my $switch = shift @ARGV; - + if ($switch eq "-V") { $qsub_opts .= "-V "; } else { @@ -121,10 +121,10 @@ () $num_threads = $argument2; } elsif ($switch =~ m/^--/) { # Config options # Convert CLI option to variable name - # by removing '--' from the switch and replacing any + # by removing '--' from the switch and replacing any # '-' with a '_' $switch =~ s/^--//; - $switch =~ s/-/_/g; + $switch =~ s/-/_/g; $cli_options{$switch} = $argument; } else { # Other qsub options - passed as is $qsub_opts .= "$switch $argument "; @@ -160,7 +160,7 @@ () if (exists $cli_options{"config"}) { $config = $cli_options{"config"}; -} +} my $default_config_file = <<'EOF'; # Default configuration @@ -168,17 +168,18 @@ () option time=* --time $0 option mem=* --mem-per-cpu $0 option mem=0 # Do not add anything to qsub_opts -option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 +option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts -option max_jobs_run=* # Do nothing default gpu=0 option gpu=0 -p shared option gpu=* -p gpu --gres=gpu:$0 --time 4:0:0 # this has to be figured out +# note: the --max-jobs-run option is supported as a special case +# by slurm.pl and you don't have to handle it in the config file. EOF # Here the configuration options specified by the user on the command line # (e.g. --mem 2G) are converted to options to the qsub system as defined in -# the config file. (e.g. if the config file has the line +# the config file. (e.g. 
if the config file has the line # "option mem=* -l ram_free=$0,mem_free=$0" # and the user has specified '--mem 2G' on the command line, the options # passed to queue system would be "-l ram_free=2G,mem_free=2G @@ -192,7 +193,7 @@ () my %cli_config_options = (); my %cli_default_options = (); -if ($opened_config_file == 0 && exists($cli_options{"config"})) { +if ($opened_config_file == 0 && exists($cli_options{"config"})) { print STDERR "Could not open config file $config\n"; exit(1); } elsif ($opened_config_file == 0 && !exists($cli_options{"config"})) { @@ -212,12 +213,12 @@ () if ($_ =~ /^command (.+)/) { $read_command = 1; $qsub_cmd = $1 . " "; - } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { + } elsif ($_ =~ m/^option ([^=]+)=\* (.+)$/) { # Config option that needs replacement with parameter value read from CLI # e.g.: option mem=* -l mem_free=$0,ram_free=$0 my $option = $1; # mem my $arg= $2; # -l mem_free=$0,ram_free=$0 - if ($arg !~ m:\$0:) { + if ($arg !~ m:\$0:) { print STDERR "Warning: the line '$line' in config file ($config) does not substitution variable \$0\n"; } if (exists $cli_options{$option}) { @@ -237,7 +238,7 @@ () } } elsif ($_ =~ m/^default (\S+)=(\S+)/) { # Default options. Used for setting default values to options i.e. when - # the user does not specify the option on the command line + # the user does not specify the option on the command line # e.g. default gpu=0 my $option = $1; # gpu my $value = $2; # 0 @@ -261,19 +262,25 @@ () for my $option (keys %cli_options) { if ($option eq "config") { next; } - if ($option eq "max_jobs_run" && $array_job != 1) { print STDERR "Ignoring $option\n"; next; } + my $value = $cli_options{$option}; - - if ($option eq "max_jobs_run") { $max_jobs_run = $value; } - if (exists $cli_default_options{($option,$value)}) { + if ($option eq "max_jobs_run") { + if ($array_job != 1) { + print STDERR "Ignoring $option since this is not an array task."; + } else { + $max_jobs_run = $value; + } + } elsif (exists $cli_default_options{($option,$value)}) { $qsub_opts .= "$cli_default_options{($option,$value)} "; } elsif (exists $cli_config_options{$option}) { $qsub_opts .= "$cli_config_options{$option} "; } elsif (exists $cli_default_options{($option,"*")}) { $qsub_opts .= $cli_default_options{($option,"*")} . " "; } else { - if ($opened_config_file == 0) { $config = "default config file"; } + if ($opened_config_file == 0) { + $config = "default config file"; + } die "$0: Command line option $option not described in $config (or value '$value' not allowed)\n"; } } @@ -301,7 +308,7 @@ () # my $cmd = ""; -foreach my $x (@ARGV) { +foreach my $x (@ARGV) { if ($x =~ m/^\S+$/) { $cmd .= $x . " "; } # If string contains no spaces, take # as-is. elsif ($x =~ m:\":) { $cmd .= "'$x' "; } # else if no dbl-quotes, use single @@ -322,23 +329,23 @@ () # make a directory called "q", # where we will put the log created by qsub... normally this doesn't contain # anything interesting, evertyhing goes to $logfile. -if (! -d "$qdir") { +if (! -d "$qdir") { system "mkdir $qdir 2>/dev/null"; sleep(5); ## This is to fix an issue we encountered in denominator lattice creation, ## where if e.g. the exp/tri2b_denlats/log/15/q directory had just been ## created and the job immediately ran, it would die with an error because nfs ## had not yet synced. I'm also decreasing the acdirmin and acdirmax in our ## NFS settings to something like 5 seconds. -} +} my $queue_array_opt = ""; if ($array_job == 1) { # It's an array job. 
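  # For example, passing '--max-jobs-run 10' with JOB=1:50 yields the option
  # '--array 1-50%10', which asks SLURM to run at most 10 of the 50 array
  # tasks at any one time.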
if ($max_jobs_run) { - $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; + $queue_array_opt = "--array ${jobstart}-${jobend}%${max_jobs_run}"; } else { - $queue_array_opt = "--array ${jobstart}-${jobend}"; + $queue_array_opt = "--array ${jobstart}-${jobend}"; } - $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get + $logfile =~ s/$jobname/\$SLURM_ARRAY_TASK_ID/g; # This variable will get # replaced by qsub, in each job, with the job-id. $cmd =~ s/$jobname/\$\{SLURM_ARRAY_TASK_ID\}/g; # same for the command... $queue_logfile =~ s/\.?$jobname//; # the log file in the q/ subdirectory @@ -475,14 +482,14 @@ () } } - # Check that the job exists in SLURM. Job can be killed if duration - # exceeds some hard limit, or in case of a machine shutdown. + # Check that the job exists in SLURM. Job can be killed if duration + # exceeds some hard limit, or in case of a machine shutdown. if (($check_sge_job_ctr++ % 10) == 0) { # Don't run qstat too often, avoid stress on SGE. if ( -f $f ) { next; }; #syncfile appeared: OK. $ret = system("squeue -j $sge_job_id >/dev/null 2>/dev/null"); # system(...) : To get the actual exit value, shift $ret right by eight bits. if ($ret>>8 == 1) { # Job does not seem to exist - # Don't consider immediately missing job as error, first wait some + # Don't consider immediately missing job as error, first wait some # time to make sure it is not just delayed creation of the syncfile. sleep(3); @@ -546,7 +553,7 @@ () push @logfiles, $logfile; } else { for (my $jobid = $jobstart; $jobid <= $jobend; $jobid++) { - my $l = $logfile; + my $l = $logfile; $l =~ s/\$SLURM_ARRAY_TASK_ID/$jobid/g; push @logfiles, $l; } From 032aa24b77655c3c7d347be82d262b0787faee57 Mon Sep 17 00:00:00 2001 From: Hossein Hadian Date: Sun, 14 Feb 2016 15:21:00 +0330 Subject: [PATCH 04/32] Minor fix regarding adaptation configs --- src/online2/online-gmm-decoding.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/online2/online-gmm-decoding.h b/src/online2/online-gmm-decoding.h index 41c9ca4c14d..8bec6cd9ab9 100644 --- a/src/online2/online-gmm-decoding.h +++ b/src/online2/online-gmm-decoding.h @@ -71,10 +71,10 @@ struct OnlineGmmDecodingAdaptationPolicyConfig { opts->Register("adaptation-first-utt-ratio", &adaptation_first_utt_ratio, "Ratio that controls frequency of fMLLR adaptation for first " "utterance of each speaker"); - opts->Register("adaptation-delay", &adaptation_first_utt_delay, + opts->Register("adaptation-delay", &adaptation_delay, "Delay before first basis-fMLLR adaptation for not-first " "utterances of each speaker"); - opts->Register("adaptation-ratio", &adaptation_first_utt_ratio, + opts->Register("adaptation-ratio", &adaptation_ratio, "Ratio that controls frequency of fMLLR adaptation for " "not-first utterances of each speaker"); } From 8aa016ad500d3242de3fa373d942b9efe9cc5fff Mon Sep 17 00:00:00 2001 From: Joshua Milas Date: Sun, 14 Feb 2016 14:14:33 -0500 Subject: [PATCH 05/32] If compiling with MSVS 2015, dont redefine snprintf --- src/base/kaldi-utils.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/base/kaldi-utils.cc b/src/base/kaldi-utils.cc index 13a3412a9bb..1ae1dc0b758 100644 --- a/src/base/kaldi-utils.cc +++ b/src/base/kaldi-utils.cc @@ -20,7 +20,9 @@ #include #elif defined(_WIN32) || defined(_MSC_VER) || defined(MINGW) #include +#if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf +#endif /* _MSC_VER < 1900 */ #else #include #endif From 1fa0e18e715389e232e634ac38050a640344ad8b Mon Sep 17 
00:00:00 2001 From: Daniel Povey Date: Sun, 14 Feb 2016 19:44:20 -0500 Subject: [PATCH 06/32] adding a couple more swbd+chain tuning experiments --- egs/swbd/s5c/local/chain/run_tdnn_5w.sh | 10 + egs/swbd/s5c/local/chain/run_tdnn_5x.sh | 476 ++++++++++++++++++++++++ 2 files changed, 486 insertions(+) create mode 100755 egs/swbd/s5c/local/chain/run_tdnn_5x.sh diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5w.sh b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh index e21c3a8b04f..1a40acfa105 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_5w.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_5w.sh @@ -6,6 +6,16 @@ # 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim # from 1800 to 1700. +# Difference is tiny. +#local/chain/compare_wer.sh 5k 5w +#System 5k 5w +#WER on train_dev(tg) 16.46 16.56 +#WER on train_dev(fg) 15.17 15.30 +#WER on eval2000(tg) 18.1 18.1 +#WER on eval2000(fg) 16.5 16.4 +#Final train prob -0.105502 -0.106549 +#Final valid prob -0.12337 -0.120079 + # _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer # in the middle, like 5e->5g, to see whether it recovers some of the improvement # of using the iVectors. diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5x.sh b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh new file mode 100755 index 00000000000..e50dadfd963 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5x.sh @@ -0,0 +1,476 @@ +#!/bin/bash + +# _5w is as _5x but decreasing the context of the averaging layer from +-0.99 +# seconds to +-0.66 seconds. I would not have expected this to work a priori, +# but the change from 5k -> 5l, which made the context wider, made WERs slightly +# worse, so I'd like to see what happens when we decrease the context. + +# It's worse. Odd because increasing the context (5k->5l) seemed to be a little +# worse also. +local/chain/compare_wer.sh 5w 5x +#System 5w 5x +#WER on train_dev(tg) 16.56 16.66 +#WER on train_dev(fg) 15.30 15.41 +#WER on eval2000(tg) 18.1 18.5 +#WER on eval2000(fg) 16.4 16.6 +#Final train prob -0.106549 -0.105693 +#Final valid prob -0.120079 -0.121834 + +# _5w is as _5k (which is a fairly good-performing ivector-free model), but +# making the same changes as 5e -> 5t, which makes the model more lightweight +# and faster to train, specifically: reduce --jesus-hidden-dim from 7500 to +# 3500, add --self-repair-scale 0.00001, and reduce --jesus-forward-output-dim +# from 1800 to 1700. + +# _5k is as _5j (omitting iVectors), and adding a statistics-extraction layer +# in the middle, like 5e->5g, to see whether it recovers some of the improvement +# of using the iVectors. + +# It recovers half of the improvement-- but the objf is better than +# we might expect. I think it's learning some phonetic stuff too. +# +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 16.46 +#WER on train_dev(fg) 14.32 16.33 15.17 +#WER on eval2000(tg) 17.3 19.1 18.1 +#WER on eval2000(fg) 15.5 17.5 16.5 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# The following is decoding with the default frames-per-chunk of 50, and +# --extra-left-context 20. +#./compare_wer.sh 5e 5j 5k +#System 5e 5j 5k +#WER on train_dev(tg) 15.43 17.59 17.37 +#WER on train_dev(fg) 14.32 16.33 16.09 +#WER on eval2000(tg) 17.3 19.1 18.8 +#WER on eval2000(fg) 15.5 17.5 17.3 +#Final train prob -0.110056 -0.114691 -0.105502 +#Final valid prob -0.129184 -0.130761 -0.12337 + +# _5j is as _5e, but omitting the iVectors. 
+ +# Definitely worse, although curiously, there is very little effect on the valid prob. +#./compare_wer.sh 5e 5j +#System 5e 5j +#WER on train_dev(tg) 15.43 17.59 +#WER on train_dev(fg) 14.32 16.33 +#WER on eval2000(tg) 17.3 19.1 +#WER on eval2000(fg) 15.5 17.5 +#Final train prob -0.110056 -0.114691 +#Final valid prob -0.129184 -0.130761 + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5x # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --egs-dir exp/chain/tdnn_5w_sp/egs \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 3500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3,mean+stddev(-63:3:9:63) -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --frames-per-chunk 300 \ + --nj 50 --cmd "$decode_cmd" \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; From d5b8b237c5e046bfd2091c2ae65e5693d18f20f0 Mon Sep 17 00:00:00 2001 From: Sabine Crevoisier Date: Mon, 15 Feb 2016 17:19:12 +0000 Subject: [PATCH 07/32] Modified bash command to avoid wildcard expansion when using phones with *. --- egs/wsj/s5/utils/prepare_lang.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs/wsj/s5/utils/prepare_lang.sh b/egs/wsj/s5/utils/prepare_lang.sh index 43b8bce1f4c..e451492cc1d 100755 --- a/egs/wsj/s5/utils/prepare_lang.sh +++ b/egs/wsj/s5/utils/prepare_lang.sh @@ -174,8 +174,8 @@ if $position_dependent_phones; then # This phone map expands the phone lists into all the word-position-dependent # versions of the phone lists. 
- cat <(for x in `cat $srcdir/silence_phones.txt`; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ - <(for x in `cat $srcdir/nonsilence_phones.txt`; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + cat <(cat $srcdir/silence_phones.txt | while read x; do for y in "" "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ + <(cat $srcdir/nonsilence_phones.txt | while read x; do for y in "" "_B" "_E" "_I" "_S"; do echo -n "$x$y "; done; echo; done) \ > $tmpdir/phone_map.txt else if "$silprob"; then @@ -245,10 +245,10 @@ cat $srcdir/extra_questions.txt 2>/dev/null | utils/apply_map.pl $tmpdir/phone_m # be inside a word. if $position_dependent_phones; then for suffix in _B _E _I _S; do - (for x in `cat $srcdir/nonsilence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (cat $srcdir/nonsilence_phones.txt | while read x; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done for suffix in "" _B _E _I _S; do - (for x in `cat $srcdir/silence_phones.txt`; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt + (cat $srcdir/silence_phones.txt | while read x; do echo -n "$x$suffix "; done; echo) >>$dir/phones/extra_questions.txt done fi From 2a862b47419b7b8a2737f162341c4c011a1a12fc Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Mon, 15 Feb 2016 17:16:46 -0500 Subject: [PATCH 08/32] swbd+chain: Add some new example scripts and an associated change in the config script to support skip-splicing. --- egs/swbd/s5c/local/chain/run_tdnn_5y.sh | 466 +++++++++++++++++++ egs/swbd/s5c/local/chain/run_tdnn_5z.sh | 457 ++++++++++++++++++ egs/swbd/s5c/local/chain/run_tdnn_6a.sh | 461 ++++++++++++++++++ egs/wsj/s5/steps/nnet3/make_jesus_configs.py | 28 +- 4 files changed, 1403 insertions(+), 9 deletions(-) create mode 100755 egs/swbd/s5c/local/chain/run_tdnn_5y.sh create mode 100755 egs/swbd/s5c/local/chain/run_tdnn_5z.sh create mode 100755 egs/swbd/s5c/local/chain/run_tdnn_6a.sh diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh new file mode 100755 index 00000000000..f89c1f5deac --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh @@ -0,0 +1,466 @@ +#!/bin/bash + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
+# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. + +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. 
+ +# It's even better than 4e, by about 0.3% abs. +# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 
500,1500]. Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
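+
+# (A rough back-of-the-envelope check, not from the original notes, of the "about a
+# million parameters" statement at the top of this file: assuming the final layer is
+# essentially an affine transform from the final hidden dim to the ~9000 pdfs that
+# build_tree.sh is asked for below,
+#   echo $(( (500 - 400) * 9000 ))    # = 900000 fewer parameters in the final layer
+# while the increases to --jesus-forward-input-dim and --jesus-forward-output-dim put a
+# comparable number of parameters back into the hidden layers.)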
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5y # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh new file mode 100755 index 00000000000..0f3e89470d8 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh @@ -0,0 +1,457 @@ +#!/bin/bash + +# _5z is as _5v, but adding skip-splicing (a new configuration option) + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
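+
+# (The skip-splicing this script introduces shows up in the --splice-indexes string
+# passed to train_tdnn.sh further down, as the skip0 / skip-3 entries appended to some
+# of the layers; compare the otherwise-identical string used in run_tdnn_5y.sh:
+#   5y: "-1,0,1 -1,0,1,2 -3,0,3       -3,0,3       -3,0,3       -6,-3,0"
+#   5z: "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3"
+# the handling of these entries is presumably the change to make_jesus_configs.py that
+# this same commit makes to support skip-splicing.)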
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
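+
+# (A note on reading the "0.2% better" / "0.2% worse" annotations in the 2o->2y table
+# above: they are absolute WER differences, e.g. for train_dev,tg
+#   17.24 - 16.99 = 0.25    # ~0.2-0.3% absolute, or roughly 1.5% relative
+# not relative reductions.)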
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_5z # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 500 --jesus-forward-output-dim 1700 --jesus-hidden-dim 2500 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh new file mode 100755 index 00000000000..70bd894f313 --- /dev/null +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -0,0 +1,461 @@ +#!/bin/bash + +# _5z is as _5z, but adding the change in configuration that +# we made from _5v to _5y (moving some parameters from final layer +# to hidden parts of network) + +# _5z is as _5v, but adding skip-splicing (a new configuration option) + +# _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. + +# WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. +# +#local/chain/compare_wer.sh 5e 5s 5t 5v +#System 5e 5s 5t 5v +#WER on train_dev(tg) 15.43 15.47 15.43 15.38 +#WER on train_dev(fg) 14.32 14.31 14.34 14.39 +#WER on eval2000(tg) 17.3 17.4 17.4 17.4 +#WER on eval2000(fg) 15.5 15.6 15.6 15.7 +#Final train prob -0.110056 -0.110928 -0.110752 -0.11156 +#Final valid prob -0.129184 -0.132139 -0.129123 -0.131797 + +# _5t is as _5s but further reducing the jesus-hidden-dim (trying to speed it +# up), from 5000 to 3500. 
+ +# about 5s: comparing with 5e which is the most recent baseline we actually +# decoded, 5s is as 5e but with jesus-forward-output-dim reduced 1800->1700, +# jesus-hidden-dim reduced 7500 to 5000, and and the new option +# --self-repair-scale 0.00001 added. Also compare 5t and 5v which have even +# smaller jesus-hidden-dims. + +# _5s is as _5r but increasing the jesus-forward-output-dim to the intermediate +# value of 1700 (between 1500 and 1800), and also a bug-fix in the self-repair +# code to a bug which was doubling the thresholds so there was, in effect, +# no upper threshold. I stopped the p,q,r runs after I found this, but in +# configuring this run I'm bearing in mind the train and valid probs from the +# p,q,r runs. + +# _5r is as _5q but also reducing --jesus-hidden-dim from 7500 to 5000. + +# _5q is as _5p but reducing jesus-forward-output-dim from 1800 to 1500 to try +# to compensate for the fact that more of the output dimensions are now being +# usefully used. + +# _5p is as _5e but adding (new option) --self-repair-scale 0.00001, to repair +# ReLUs that are over or under-saturated. + +# _5e is as _5b, but reducing --xent-regularize from 0.2 to 0.1 (since based on +# the results of 4v, 4w and 5c, it looks like 0.1 is better than 0.2 or 0.05). + +# The improvement is small but consistent (0.1, 0.1, 0.0, 0.1) and also seen +# in the train and valid probs. +#System 5b 5e +#WER on train_dev(tg) 15.51 15.43 +#WER on train_dev(fg) 14.39 14.32 +#WER on eval2000(tg) 17.3 17.3 +#WER on eval2000(fg) 15.6 15.5 +#Final train prob -0.112013 -0.110056 +#Final valid prob -0.130879 -0.129184 + +# _5b is as _5a, but adding --leaky-hmm-coefficient 0.1. + +# It does seem helpful on average: (-0.35, -0.35, -0.1, 0). +#./compare_wer.sh 5a 5b +#System 5a 5b +#WER on train_dev(tg) 15.86 15.51 +#WER on train_dev(fg) 14.74 14.39 +#WER on eval2000(tg) 17.4 17.3 +#WER on eval2000(fg) 15.6 15.6 +#Final train prob -0.0998359 -0.112013 +#Final valid prob -0.115884 -0.130879 + +# _5a is as _4w, but increasing jesus-forward-output-dim from 1400 to 1800, and +# jesus-forward-input-dim from 400 to 500. Hoping that the cross-entropy regularization +# will mean that the increased parameters are now helpful. + +# _4w is as _4v, but doubling --xent-regularize to 0.2 + +# _4v is as _4r, but with --xent-regularize 0.1. Increasing max_param_change +# from 1.0 to 2.0 because there is a lot of parameter change in the final xent +# layer, and this limits the rate of change of the other layers. + +# _4r is as _4f, but one more hidden layer, and reducing context of existing +# layers so we can re-use the egs. Reducing jesus-forward-output-dim slightly +# from 1500 to 1400. + +# This is better than 4f by almost all metrics. +# ./compare_wer.sh 4f 4r +# System 4f 4r +# WER on train_dev(tg) 16.83 16.50 +# WER on train_dev(fg) 15.73 15.45 +# WER on eval2000(tg) 18.4 18.3 +# WER on eval2000(fg) 16.6 16.7 +# Final train prob -0.105832 -0.103652 +# Final valid prob -0.123021 -0.121105 + +# _4f is as _4e, but halving the regularization from 0.0001 to 0.00005. + +# It's even better than 4e, by about 0.3% abs. 
+# 4c 4e 4f +# Final valid prob: -0.1241 -0.1267 -0.1230 +# Final train prob: -0.08820 -0.1149 -0.1058 + +# ./show_wer.sh 4f +# %WER 16.83 [ 8282 / 49204, 870 ins, 2354 del, 5058 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_tg/wer_10_0.0 +# %WER 15.73 [ 7739 / 49204, 864 ins, 2256 del, 4619 sub ] exp/chain/tdnn_4f_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +# %WER 18.4 | 4459 42989 | 83.5 11.0 5.5 2.0 18.4 56.2 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.6 | 4459 42989 | 85.2 9.7 5.1 1.8 16.6 53.4 | exp/chain/tdnn_4f_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 4e +# %WER 17.09 [ 8407 / 49204, 923 ins, 2242 del, 5242 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_tg/wer_9_0.0 +# %WER 15.91 [ 7829 / 49204, 932 ins, 2141 del, 4756 sub ] exp/chain/tdnn_4e_sp/decode_train_dev_sw1_fsh_fg/wer_9_0.0 +# %WER 18.5 | 4459 42989 | 83.5 10.8 5.7 2.0 18.5 56.0 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_tg/score_9_0.0/eval2000_hires.ctm.filt.sys +# %WER 16.9 | 4459 42989 | 84.9 9.8 5.4 1.8 16.9 53.9 | exp/chain/tdnn_4e_sp/decode_eval2000_sw1_fsh_fg/score_9_0.0/eval2000_hires.ctm.filt.sys + + +# _4e is as _4c, but adding the option --l2-regularize 0.0001. + +# _4c is as _4a, but using half the --jesus-hidden-dim: 7500 versus 15000. + +# _4a is as _3s, but using narrower splice-indexes in the first layer. + +# _3s is as _3r but reducing jesus-forward-input-dim from 500 to 400. +# num-params is quite small now: 5.4 million, vs. 12.1 million in 2y, and 8.8 million in 3p. +# This of course reduces overtraining. Results are a bit better than 3p but still +# not as good as 2y + +# ./show_wer.sh 3s +# %WER 17.88 [ 8799 / 49204, 1006 ins, 2312 del, 5481 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 16.67 [ 8200 / 49204, 982 ins, 2221 del, 4997 sub ] exp/chain/tdnn_3s_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 19.6 | 4459 42989 | 82.8 11.8 5.4 2.4 19.6 57.6 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.6 | 4459 42989 | 84.4 10.1 5.4 2.1 17.6 54.7 | exp/chain/tdnn_3s_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 3p +# %WER 18.05 [ 8880 / 49204, 966 ins, 2447 del, 5467 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_tg/wer_12_0.0 +# %WER 16.86 [ 8296 / 49204, 967 ins, 2321 del, 5008 sub ] exp/chain/tdnn_3p_sp/decode_train_dev_sw1_fsh_fg/wer_12_0.0 +# %WER 19.8 | 4459 42989 | 82.4 11.5 6.1 2.1 19.8 57.7 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_tg/score_11_0.0/eval2000_hires.ctm.filt.sys +# %WER 18.2 | 4459 42989 | 83.9 10.5 5.7 2.0 18.2 55.6 | exp/chain/tdnn_3p_sp/decode_eval2000_sw1_fsh_fg/score_11_0.0/eval2000_hires.ctm.filt.sys +# a03:s5c: ./show_wer.sh 2y +# %WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +# %WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +# %WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +# %WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + + +# _3r is as _3p but reducing the number of parameters as it seemed to be +# overtraining (despite already being quite a small model): [600,1800 -> +# 500,1500]. 
Also in the interim there was a script change to +# nnet3/chain/train_tdnn.sh to, on mix-up iters, apply half the max-change. +# [changing it right now from 1/2 to 1/sqrt(2) which is more consistent +# with the halving of the minibatch size.] + + +# _3p is the same as 3o, but after a code and script change so we can use +# natural gradient for the RepeatedAffineComponent. +# [natural gradient was helpful, based on logs; +# also made a change to use positive bias for the jesus-component affine parts.] + +# _3o is as _3n but filling in the first splice-indexes from -1,2 to -1,0,1,2. + +# _3n is as _3d (a non-recurrent setup), but using the more recent scripts that support +# recurrence, with improvements to the learning of the jesus layers. + +# _3g is as _3f but using 100 blocks instead of 200, as in d->e 200 groups was found +# to be worse. +# It's maybe a little better than the baseline 2y; and better than 3d [-> I guess recurrence +# is helpful.] +#./show_wer.sh 3g +#%WER 17.05 [ 8387 / 49204, 905 ins, 2386 del, 5096 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.67 [ 7712 / 49204, 882 ins, 2250 del, 4580 sub ] exp/chain/tdnn_3g_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.7 | 4459 42989 | 83.5 11.1 5.3 2.2 18.7 56.2 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 16.8 | 4459 42989 | 85.1 9.9 5.0 2.0 16.8 53.7 | exp/chain/tdnn_3g_sp/decode_eval2000_sw1_fsh_fg/score_10_0.5/eval2000_hires.ctm.filt.sys +#a03:s5c: ./show_wer.sh 2y +#%WER 16.99 [ 8358 / 49204, 973 ins, 2193 del, 5192 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_tg/wer_11_0.0 +#%WER 15.86 [ 7803 / 49204, 959 ins, 2105 del, 4739 sub ] exp/chain/tdnn_2y_sp/decode_train_dev_sw1_fsh_fg/wer_11_0.0 +#%WER 18.9 | 4459 42989 | 83.4 11.3 5.3 2.3 18.9 56.3 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.1 10.1 4.8 2.1 17.0 53.5 | exp/chain/tdnn_2y_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +#a03:s5c: ./show_wer.sh 3d +#%WER 17.35 [ 8539 / 49204, 1023 ins, 2155 del, 5361 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_tg/wer_10_0.0 +#%WER 16.09 [ 7919 / 49204, 1012 ins, 2071 del, 4836 sub ] exp/chain/tdnn_3d_sp/decode_train_dev_sw1_fsh_fg/wer_10_0.0 +#%WER 18.9 | 4459 42989 | 83.2 11.2 5.6 2.1 18.9 56.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_tg/score_10_0.0/eval2000_hires.ctm.filt.sys +#%WER 17.0 | 4459 42989 | 85.0 9.8 5.2 2.0 17.0 53.6 | exp/chain/tdnn_3d_sp/decode_eval2000_sw1_fsh_fg/score_10_0.0/eval2000_hires.ctm.filt.sys + +# _3f is as _3e, but modifying the splicing setup to add (left) recurrence: +# added the :3's in --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3:-3 -6,-3,0,3:-3 -6,-3,0,3:-3" +# Therefore it's +# no longer really a tdnn, more like an RNN combined with TDNN. BTW, I'm not re-dumping egs with extra +# context, and this isn't really ideal - I want to see if this seems promising first. + +# _3e is as _3d, but increasing the --num-jesus-blocks from 100 (the default) +# to 200 in order to reduce computation in the Jesus layer. + +# _3d is as _2y, and re-using the egs, but using --jesus-opts and +# configs from make_jesus_configs.py. +# --jesus-opts "--affine-output-dim 600 --jesus-output-dim 1800 --jesus-hidden-dim 15000" \ +# --splice-indexes "-2,-1,0,1,2 -1,2 -3,0,3 -6,-3,0,3 -6,-3,0,3" + +# _2y is as _2o, but increasing the --frames-per-iter by a factor of 1.5, from +# 800k to 1.2 million. 
The aim is to avoid some of the per-job overhead +# (model-averaging, etc.), since each iteration takes only a minute or so. +# I added the results to the table below. It seems the same on average- +# which is good. We'll probably keep this configuration. + +# _2o is as _2m, but going back to our original 2-state topology, which it turns +# out that I never tested to WER. +# hm--- it's about the same, or maybe slightly better! +# caution: accidentally overwrote most of this dir, but kept the key stuff. + +# note: when I compare with the rerun of 2o (not shown), this run is actually +# better. +# WER on 2m 2o 2y [ now comparing 2o->2y:] +# train_dev,tg 17.22 17.24 16.99 0.2% better +# train_dev,fg 15.87 15.93 15.86 0.1% better +# eval2000,tg 18.7 18.7 18.9 0.2% worse +# eval2000,fg 17.0 16.9 17.0 0.1% worse + +# train-prob,final -0.0803 -0.0835 +# valid-prob,final -0.0116 -0.0122 + +# _2m is as _2k, but setting --leftmost-questions-truncate=-1, i.e. disabling +# that mechanism. + +# _2k is as _2i, but doing the same change as in _s -> _2e, in which we +# set --apply-deriv-weights false and --frames-overlap-per-eg 0. + +# _2i is as _2d but with a new set of code for estimating the LM, in which we compute +# the log-like change when deciding which states to back off. The code is not the same +# as the one in 2{f,g,h}. We have only the options --num-extra-lm-states=2000. By +# default it estimates a 4-gram, with 3-gram as the no-prune order. So the configuration +# is quite similar to 2d, except new/more-exact code is used. + +# _2d is as _2c but with different LM options: +# --lm-opts "--ngram-order=4 --leftmost-context-questions=/dev/null --num-extra-states=2000" +# ... this gives us a kind of pruned 4-gram language model, instead of a 3-gram. +# the --leftmost-context-questions=/dev/null option overrides the leftmost-context-questions +# provided from the tree-building, and effectively puts the leftmost context position as a single +# set. +# This seems definitely helpful: on train_dev, with tg improvement is 18.12->17.55 and with fg +# from 16.73->16.14; and on eval2000, with tg from 19.8->19.5 and with fg from 17.8->17.6. + +# _2c is as _2a but after a code change in which we start using transition-scale +# and self-loop-scale of 1 instead of zero in training; we change the options to +# mkgraph used in testing, to set the scale to 1.0. This shouldn't affect +# results at all; it's is mainly for convenience in pushing weights in graphs, +# and checking that graphs are stochastic. + +# _2a is as _z but setting --lm-opts "--num-extra-states=8000". + +# _z is as _x but setting --lm-opts "--num-extra-states=2000". +# (see also y, which has --num-extra-states=500). + +# _x is as _s but setting --lm-opts "--num-extra-states=0". +# this is a kind of repeat of the u->v experiment, where it seemed to make things +# worse, but there were other factors involved in that so I want to be sure. + +# _s is as _q but setting pdf-boundary-penalty to 0.0 +# This is helpful: 19.8->18.0 after fg rescoring on all of eval2000, +# and 18.07 -> 16.96 on train_dev, after fg rescoring. + +# _q is as _p except making the same change as from n->o, which +# reduces the parameters to try to reduce over-training. We reduce +# relu-dim from 1024 to 850, and target num-states from 12k to 9k, +# and modify the splicing setup. +# note: I don't rerun the tree-building, I just use the '5o' treedir. 
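+# (As a quick illustration of how the per-condition comparisons in these notes
+# are summarized: for the 2o->2y table above, averaging the four WER deltas, e.g.
+#   python -c "print(sum([16.99-17.24, 15.86-15.93, 18.9-18.7, 17.0-16.9])/4)"
+# gives about -0.005, i.e. essentially zero, which is what "the same on average"
+# refers to.  This one-liner is purely illustrative and not part of the recipe.)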
+ +# _p is as _m except with a code change in which we switch to a different, more +# exact mechanism to deal with the edges of the egs, and correspondingly +# different script options... we now dump weights with the egs, and apply the +# weights to the derivative w.r.t. the output instead of using the +# --min-deriv-time and --max-deriv-time options. Increased the frames-overlap +# to 30 also. This wil. give 10 frames on each side with zero derivs, then +# ramping up to a weight of 1.0 over 10 frames. + +# _m is as _k but after a code change that makes the denominator FST more +# compact. I am rerunning in order to verify that the WER is not changed (since +# it's possible in principle that due to edge effects related to weight-pushing, +# the results could be a bit different). +# The results are inconsistently different but broadly the same. On all of eval2000, +# the change k->m is 20.7->20.9 with tg LM and 18.9->18.6 after rescoring. +# On the train_dev data, the change is 19.3->18.9 with tg LM and 17.6->17.6 after rescoring. + + +# _k is as _i but reverting the g->h change, removing the --scale-max-param-change +# option and setting max-param-change to 1.. Using the same egs. + +# _i is as _h but longer egs: 150 frames instead of 75, and +# 128 elements per minibatch instead of 256. + +# _h is as _g but different application of max-param-change (use --scale-max-param-change true) + +# _g is as _f but more splicing at last layer. + +# _f is as _e but with 30 as the number of left phone classes instead +# of 10. + +# _e is as _d but making it more similar in configuration to _b. +# (turns out b was better than a after all-- the egs' likelihoods had to +# be corrected before comparing them). +# the changes (vs. d) are: change num-pdfs target from 8k to 12k, +# multiply learning rates by 5, and set final-layer-normalize-target to 0.5. + +# _d is as _c but with a modified topology (with 4 distinct states per phone +# instead of 2), and a slightly larger num-states (8000) to compensate for the +# different topology, which has more states. + +# _c is as _a but getting rid of the final-layer-normalize-target (making it 1.0 +# as the default) as it's not clear that it was helpful; using the old learning-rates; +# and modifying the target-num-states to 7000. + +# _b is as as _a except for configuration changes: using 12k num-leaves instead of +# 5k; using 5 times larger learning rate, and --final-layer-normalize-target=0.5, +# which will make the final layer learn less fast compared with other layers. + +set -e + +# configs for 'chain' +stage=12 +train_stage=-10 +get_egs_stage=-10 +speed_perturb=true +dir=exp/chain/tdnn_6a # Note: _sp will get added to this if $speed_perturb == true. + +# training options +num_epochs=4 +initial_effective_lrate=0.001 +final_effective_lrate=0.0001 +leftmost_questions_truncate=-1 +max_param_change=2.0 +final_layer_normalize_target=0.5 +num_jobs_initial=3 +num_jobs_final=16 +minibatch_size=128 +frames_per_eg=150 +remove_egs=false + +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 11 ]; then + # Build a tree using our new topology. + steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ + --leftmost-questions-truncate $leftmost_questions_truncate \ + --cmd "$train_cmd" 9000 data/$train_set $lang $ali_dir $treedir +fi + +if [ $stage -le 12 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{5,6,7,8}/$USER/kaldi-data/egs/swbd-$(date +'%m_%d_%H_%M')/s5c/$dir/egs/storage $dir/egs/storage + fi + + touch $dir/egs/.nodelete # keep egs around when that run dies. + + steps/nnet3/chain/train_tdnn.sh --stage $train_stage \ + --xent-regularize 0.1 \ + --leaky-hmm-coefficient 0.1 \ + --l2-regularize 0.00005 \ + --egs-dir exp/chain/tdnn_2y_sp/egs \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --apply-deriv-weights false \ + --frames-per-iter 1200000 \ + --lm-opts "--num-extra-lm-states=2000" \ + --get-egs-stage $get_egs_stage \ + --minibatch-size $minibatch_size \ + --egs-opts "--frames-overlap-per-eg 0" \ + --frames-per-eg $frames_per_eg \ + --num-epochs $num_epochs --num-jobs-initial $num_jobs_initial --num-jobs-final $num_jobs_final \ + --feat-type raw \ + --online-ivector-dir exp/nnet3/ivectors_${train_set} \ + --cmvn-opts "--norm-means=false --norm-vars=false" \ + --initial-effective-lrate $initial_effective_lrate --final-effective-lrate $final_effective_lrate \ + --max-param-change $max_param_change \ + --cmd "$decode_cmd" \ + --remove-egs $remove_egs \ + data/${train_set}_hires $treedir exp/tri4_lats_nodup$suffix $dir || exit 1; +fi + +if [ $stage -le 13 ]; then + # Note: it might appear that this $lang directory is mismatched, and it is as + # far as the 'topo' is concerned, but this script doesn't read the 'topo' from + # the lang directory. + utils/mkgraph.sh --self-loop-scale 1.0 data/lang_sw1_tg $dir $dir/graph_sw1_tg +fi + +decode_suff=sw1_tg +graph_dir=$dir/graph_sw1_tg +if [ $stage -le 14 ]; then + for decode_set in train_dev eval2000; do + ( + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context 20 \ + --nj 50 --cmd "$decode_cmd" \ + --online-ivector-dir exp/nnet3/ivectors_${decode_set} \ + $graph_dir data/${decode_set}_hires $dir/decode_${decode_set}_${decode_suff} || exit 1; + if $has_fisher; then + steps/lmrescore_const_arpa.sh --cmd "$decode_cmd" \ + data/lang_sw1_{tg,fsh_fg} data/${decode_set}_hires \ + $dir/decode_${decode_set}_sw1_{tg,fsh_fg} || exit 1; + fi + ) & + done +fi +wait; +exit 0; diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py index d0008e81711..39ed9f961e0 100755 --- a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -112,7 +112,7 @@ def __init__(self, config_string, input_dim, input_name): self.input_dim = input_dim self.input_name = input_name - m = re.search("(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", + m = re.match("^(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)$", config_string) if m == None: sys.exit("Invalid splice-index or statistics-config string: " + config_string) @@ -204,8 +204,9 @@ def WriteConfigs(self, f): try: x = StatisticsConfig(s, 100, 'foo') except: - sys.exit("The following element of the splicing array is not a valid specifier " - "of statistics: " + s) + if re.match("skip(-?\d+)$", s) == None: + sys.exit("The following element of the splicing array is not a valid specifier " + "of statistics or of the form skipDDD: " + s) if leftmost_splice == 10000 or rightmost_splice == -10000: sys.exit("invalid element of --splice-indexes: " + string) @@ 
-295,12 +296,21 @@ def WriteConfigs(self, f): splices.append('Offset({0}, {1})'.format(cur_output, offset)) spliced_dims.append(cur_affine_output_dim) except: - # it's not an integer offset, so assume it specifies the - # statistics-extraction. - stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) - stats.WriteConfigs(f) - splices.append(stats.Descriptor()) - spliced_dims.extend(stats.OutputDims()) + # it's not an integer offset, so assume it either specifies the + # statistics-extraction, or is of the form skipXX where XX is an + # integer offset (this takes as input the previous post-jesus layer). + m = re.match("skip(-?\d+)$", s) + if m != None: + if l <= 2: + sys.exit("You cannot use skip-splicing for the 1st 2 layers") + offset = m.group(1) + splices.append("Offset(post-jesus{0}, {1})".format(l-1, offset)) + spliced_dims.append(args.jesus_forward_output_dim) + else: + stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) + stats.WriteConfigs(f) + splices.append(stats.Descriptor()) + spliced_dims.extend(stats.OutputDims()) # get the input to the Jesus layer. cur_input = 'Append({0})'.format(', '.join(splices)) From d25785ddc1d24269cb71497092bfa209e9fbe84d Mon Sep 17 00:00:00 2001 From: Xingyu Na Date: Tue, 16 Feb 2016 10:48:25 +0800 Subject: [PATCH 09/32] small bug fix for fisher_swbd data prep --- egs/fisher_swbd/s5/local/swbd1_data_prep.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh index 0a8a375b7ed..98a12e1c0a3 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh @@ -102,7 +102,7 @@ local/swbd1_map_words.pl -f 2- $dir/transcripts2.txt > $dir/text # final trans # format acronyms in text python local/map_acronyms_transcripts.py -i $dir/text -o $dir/text_map \ - -M data/local/dict/acronyms_swbd.map + -M data/local/dict_nosp/acronyms_swbd.map cp $dir/text $dir/text_bk mv $dir/text_map $dir/text From e06745d58e24152071a72b04b11ca67d16c967c4 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Tue, 16 Feb 2016 02:20:35 -0500 Subject: [PATCH 10/32] adding some newer results for swbd+chain tuning; reverting skip-splicing option which I found not helpful. --- egs/swbd/s5c/local/chain/run_tdnn_5y.sh | 10 ++++++ egs/swbd/s5c/local/chain/run_tdnn_5z.sh | 9 ++++++ egs/swbd/s5c/local/chain/run_tdnn_6a.sh | 33 +++++++++++++++----- egs/wsj/s5/steps/nnet3/make_jesus_configs.py | 28 ++++++----------- 4 files changed, 54 insertions(+), 26 deletions(-) diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh index f89c1f5deac..54769c23734 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_5y.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_5y.sh @@ -11,6 +11,16 @@ # hidden parts of the network. Hopefully this will reduce overtraining, since # the hidden parts of the network are regularized by the --xent-regularize option. +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). +#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 + # _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. # WER is almost the same, perhaps <0.1% worse; diagnostics are slightly worse. 
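
Since the next file (run_tdnn_5z.sh) documents the skip-splicing experiment that this patch reverts, a short standalone sketch of how a "skipN" token in --splice-indexes was interpreted may help.  It paraphrases the make_jesus_configs.py change from the earlier patch in this series; it is not part of the scripts, the layer/descriptor names ("jesus2-output", "post-jesusN") are placeholders, and the statistics-extraction branch of the real script is omitted.

#!/usr/bin/env python
# Illustrative sketch only: how one entry of --splice-indexes would be turned
# into a Descriptor under the (now reverted) skip-splicing scheme.  Plain
# integers splice the current layer's input at that time offset; "skipN"
# splices the previous layer's post-jesus output at offset N.
import re

def splice_entry_to_descriptor(entry, layer, cur_output):
    splices = []
    for s in entry.split(","):
        try:
            offset = int(s)
            splices.append("Offset({0}, {1})".format(cur_output, offset))
        except ValueError:
            m = re.match(r"skip(-?\d+)$", s)
            if m is None:
                raise ValueError("unhandled splice token: " + s)
            if layer <= 2:
                raise ValueError("skip-splicing was not allowed for the first 2 layers")
            # takes as input the previous post-jesus layer
            splices.append("Offset(post-jesus{0}, {1})".format(layer - 1, m.group(1)))
    return "Append({0})".format(", ".join(splices))

if __name__ == "__main__":
    # e.g. the third entry of "-1,0,1 -1,0,1,2 -3,0,3,skip0 ..." used before the revert
    print(splice_entry_to_descriptor("-3,0,3,skip0", layer=3,
                                     cur_output="jesus2-output"))
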
diff --git a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh index 0f3e89470d8..57910eb00c7 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_5z.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_5z.sh @@ -1,6 +1,15 @@ #!/bin/bash # _5z is as _5v, but adding skip-splicing (a new configuration option) +# It seems definitely not helpful. I'll remove the option soon. +#local/chain/compare_wer.sh 5v 5z +#System 5v 5z +#WER on train_dev(tg) 15.38 15.60 +#WER on train_dev(fg) 14.39 14.50 +#WER on eval2000(tg) 17.4 17.6 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.113823 +#Final valid prob -0.131797 -0.131356 # _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh index 70bd894f313..12589033819 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -1,10 +1,29 @@ #!/bin/bash -# _5z is as _5z, but adding the change in configuration that -# we made from _5v to _5y (moving some parameters from final layer -# to hidden parts of network) - -# _5z is as _5v, but adding skip-splicing (a new configuration option) +# _6a is as _5y, where we keep the hidden parts of the network a bit larger +# but take the final-hidden-dim back up to 500, which is the same as what +# it was in 5v. + +# _5y is as _5v, but rebalancing the network to have fewer parameters in the +# final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 +# (it defaults to --jesus-forward-hidden-dim) to 400, and increasing +# --jesus-forward-input-dim from 500 to 600 and +# --jesus-forward-output-dim from 1700 to 1800, +# and --jesus-hidden-dim from 2500 to 3000 (note: I don't really expect this last change +# to make much of a difference). +# Very roughly, we're moving about a million parameters from the final layer to the +# hidden parts of the network. Hopefully this will reduce overtraining, since +# the hidden parts of the network are regularized by the --xent-regularize option. + +# The diagnostics were improved, but the WER is no better (or maybe slightly worse). +#local/chain/compare_wer.sh 5v 5y +#System 5v 5y +#WER on train_dev(tg) 15.38 15.50 +#WER on train_dev(fg) 14.39 14.37 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.7 +#Final train prob -0.11156 -0.111636 +#Final valid prob -0.131797 -0.128892 # _5v is as _5t, but further reducing the --jesus-hidden-dim from 3500 to 2500. 
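+# (Back-of-the-envelope check of the "about a million parameters" figure quoted
+# in the _5y note above: assuming the final affine layer maps final-hidden-dim to
+# the ~9000 tree leaves this recipe builds, changing that dim between 500 and 400
+# accounts for roughly
+#   python -c "print((500-400)*9000)"   # -> 900000 weights
+# The exact count depends on the config generator (biases, the xent branch, etc.),
+# so treat this as illustrative only.)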
@@ -412,8 +431,8 @@ if [ $stage -le 12 ]; then --leaky-hmm-coefficient 0.1 \ --l2-regularize 0.00005 \ --egs-dir exp/chain/tdnn_2y_sp/egs \ - --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 400 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ - --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3,skip0 -3,0,3,skip0 -3,0,3,skip0 -6,-3,0,skip-3" \ + --jesus-opts "--jesus-forward-input-dim 600 --jesus-forward-output-dim 1800 --final-hidden-dim 500 --jesus-hidden-dim 3000 --jesus-stddev-scale 0.2 --final-layer-learning-rate-factor 0.25 --self-repair-scale 0.00001" \ + --splice-indexes "-1,0,1 -1,0,1,2 -3,0,3 -3,0,3 -3,0,3 -6,-3,0" \ --apply-deriv-weights false \ --frames-per-iter 1200000 \ --lm-opts "--num-extra-lm-states=2000" \ diff --git a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py index 39ed9f961e0..d0008e81711 100755 --- a/egs/wsj/s5/steps/nnet3/make_jesus_configs.py +++ b/egs/wsj/s5/steps/nnet3/make_jesus_configs.py @@ -112,7 +112,7 @@ def __init__(self, config_string, input_dim, input_name): self.input_dim = input_dim self.input_name = input_name - m = re.match("^(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)$", + m = re.search("(mean|mean\+stddev)\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", config_string) if m == None: sys.exit("Invalid splice-index or statistics-config string: " + config_string) @@ -204,9 +204,8 @@ def WriteConfigs(self, f): try: x = StatisticsConfig(s, 100, 'foo') except: - if re.match("skip(-?\d+)$", s) == None: - sys.exit("The following element of the splicing array is not a valid specifier " - "of statistics or of the form skipDDD: " + s) + sys.exit("The following element of the splicing array is not a valid specifier " + "of statistics: " + s) if leftmost_splice == 10000 or rightmost_splice == -10000: sys.exit("invalid element of --splice-indexes: " + string) @@ -296,21 +295,12 @@ def WriteConfigs(self, f): splices.append('Offset({0}, {1})'.format(cur_output, offset)) spliced_dims.append(cur_affine_output_dim) except: - # it's not an integer offset, so assume it either specifies the - # statistics-extraction, or is of the form skipXX where XX is an - # integer offset (this takes as input the previous post-jesus layer). - m = re.match("skip(-?\d+)$", s) - if m != None: - if l <= 2: - sys.exit("You cannot use skip-splicing for the 1st 2 layers") - offset = m.group(1) - splices.append("Offset(post-jesus{0}, {1})".format(l-1, offset)) - spliced_dims.append(args.jesus_forward_output_dim) - else: - stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) - stats.WriteConfigs(f) - splices.append(stats.Descriptor()) - spliced_dims.extend(stats.OutputDims()) + # it's not an integer offset, so assume it specifies the + # statistics-extraction. + stats = StatisticsConfig(s, cur_affine_output_dim, cur_output) + stats.WriteConfigs(f) + splices.append(stats.Descriptor()) + spliced_dims.extend(stats.OutputDims()) # get the input to the Jesus layer. 
cur_input = 'Append({0})'.format(', '.join(splices)) From 341e0f023a5083d3c31e207097be1ca254bf1e80 Mon Sep 17 00:00:00 2001 From: Gaurav Kumar Date: Tue, 16 Feb 2016 20:00:45 -0500 Subject: [PATCH 11/32] Changes to allow the large spanish word list to be downloaded if not present --- .../s5/local/fsp_prepare_dict.sh | 30 ++++++++++-- .../s5/local/merge_lexicons.py | 47 ++++++++++--------- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 0f2bd037ba0..824edd99da8 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -22,12 +22,32 @@ lexicon=$1 #Get all unique words, remove punctuation. if [ $stage -le 0 ]; then cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords - if [ -f "/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" ]; then - # Merge with gigaword corpus - $local/merge_lexicons.py - mv $tmpdir/uniquewords $tmpdir/uniquewords.small - mv $tmpdir/uniquewords64k $tmpdir/uniquewords + if [ -f "${tmpdir}/es_wordlist.json" ]; then + echo "Could not find the large collection of Spanish words es_wordlist.json" + echo "Trying to download it via wget" + + if ! which wget >&/dev/null; then + echo "This script requires you to first install wget" + exit 1; + fi + + cwd=`pwd` + cd $tmpdir + wget -T 10 -t 3 -c http://www.openslr.org/resources/21/es_wordlist.json.tgz + + if [ ! -e ${tmpdir}/es_wordlist.json.tgz ]; then + echo "Download of the large Spanish word list failed" + exit 1; + fi + + tar -xvzfo es_wordlist.json.tgz || exit 1; + cd $cwd fi + + # Merge with gigaword corpus + $local/merge_lexicons.py ${tmpdir} ${lexicon} + mv $tmpdir/uniquewords $tmpdir/uniquewords.small + mv $tmpdir/uniquewords64k $tmpdir/uniquewords fi #Then get the list of phones form basic_rules in the lexicon folder diff --git a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py index 8c67ae56804..5c09f09bc35 100755 --- a/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py +++ b/egs/fisher_callhome_spanish/s5/local/merge_lexicons.py @@ -7,55 +7,58 @@ import sys import json import codecs -import os import operator -wordlimit=64000 -uw_fisher="data/local/tmp/uniquewords" -uw_gigaword="/export/a04/gkumar/corpora/gigaword-spanish/bin/gigaword-lexicon.json" -uw_LDC="/export/corpora/LDC/LDC96L16/callhome_spanish_lexicon_970908/preferences" +wordlimit = 64000 +tmpdir = sys.argv[1] +ldc_lexicon = sys.argv[2] +uw_fisher = tmpdir + "/uniquewords" +uw_gigaword = tmpdir + "/es_wordlist.json" +uw_LDC = ldc_lexicon + "/callhome_spanish_lexicon_970908/preferences" merged_lexicon = [] # All three lexicons are in different formats # First add the data from lexicon_fisher (A) into the dictionary fisher = codecs.open(uw_fisher, encoding='utf-8') for line in fisher: - merged_lexicon.append(line.strip()) + merged_lexicon.append(line.strip()) fisher.close() -print "After adding the fisher data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the fisher data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." 
# Now add data from the LDC lexicon ldc = codecs.open(uw_LDC, encoding='iso-8859-1') -for line in ldc: - entries = line.strip().split('\t') - if entries[0].lower() not in merged_lexicon: - merged_lexicon.append(entries[0].lower()) +for line in ldc: + entries = line.strip().split('\t') + if entries[0].lower() not in merged_lexicon: + merged_lexicon.append(entries[0].lower()) -print "After adding the LDC data, the lexicon contains " + str(len(merged_lexicon)) + " entries." +print "After adding the LDC data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Finally add the gigaword data gigaword = json.load(open(uw_gigaword)) gigaword = reversed(sorted(gigaword.iteritems(), key=operator.itemgetter(1))) for item in gigaword: - # We need a maximum of wordlimit words in the lexicon - if len(merged_lexicon) == wordlimit: - break + # We need a maximum of wordlimit words in the lexicon + if len(merged_lexicon) == wordlimit: + break - if item[0].lower() not in merged_lexicon: - merged_lexicon.append(item[0].lower()) - -print "After adding the Gigaword data, the lexicon contains " + str(len(merged_lexicon)) + " entries." + if item[0].lower() not in merged_lexicon: + merged_lexicon.append(item[0].lower()) + +print "After adding the Gigaword data, the lexicon contains " \ + + str(len(merged_lexicon)) + " entries." # Now write the uniquewords to a file -lf = codecs.open('data/local/tmp/uniquewords64k', encoding='utf-8', mode='w+') +lf = codecs.open(tmpdir + '/uniquewords64k', encoding='utf-8', mode='w+') ltuples = sorted(merged_lexicon) for item in ltuples: - lf.write(item + "\n") + lf.write(item + "\n") lf.close() print "Finshed writing unique words" - From b7aa6b126e93806cb72036f4be9db95d4bb002e4 Mon Sep 17 00:00:00 2001 From: Gaurav Kumar Date: Tue, 16 Feb 2016 22:15:36 -0500 Subject: [PATCH 12/32] Small changes. Fixes #494 --- egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index 824edd99da8..dae46cfddf5 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -22,7 +22,7 @@ lexicon=$1 #Get all unique words, remove punctuation. if [ $stage -le 0 ]; then cat $datadir/text | sed 's:[0-9][0-9]\S*::g' | sed 's:[\.,\?]::g' | tr " " "\n" | sort | uniq | awk '{if (NF > 0){ print; }}' > $tmpdir/uniquewords - if [ -f "${tmpdir}/es_wordlist.json" ]; then + if [ ! 
-f "${tmpdir}/es_wordlist.json" ]; then echo "Could not find the large collection of Spanish words es_wordlist.json" echo "Trying to download it via wget" @@ -40,7 +40,7 @@ if [ $stage -le 0 ]; then exit 1; fi - tar -xvzfo es_wordlist.json.tgz || exit 1; + tar -xovzf es_wordlist.json.tgz || exit 1; cd $cwd fi From 577659a6836c4419a417aac6e9d9d7a659ccd3af Mon Sep 17 00:00:00 2001 From: Xingyu Na Date: Wed, 17 Feb 2016 12:27:38 +0800 Subject: [PATCH 13/32] fix swbd1 data prep duplicates --- .../s5/local/swbd1_data_download.sh | 17 ++-------------- egs/fisher_swbd/s5/local/swbd1_data_prep.sh | 20 +------------------ egs/swbd/s5c/local/swbd1_data_download.sh | 17 ++-------------- egs/swbd/s5c/local/swbd1_data_prep.sh | 19 +----------------- 4 files changed, 6 insertions(+), 67 deletions(-) diff --git a/egs/fisher_swbd/s5/local/swbd1_data_download.sh b/egs/fisher_swbd/s5/local/swbd1_data_download.sh index 6dac146c26b..95c9d5e58a4 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_download.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_download.sh @@ -10,18 +10,11 @@ ## you unpacked this. We are just doing a "find" command to locate ## the .sph files. -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). -## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - . path.sh #check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" exit 1; fi @@ -30,18 +23,12 @@ SWBD_DIR=$1 dir=data/local/train_swbd mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -[ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - # Trans directory check if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then ( diff --git a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh index 98a12e1c0a3..54513437dbe 100755 --- a/egs/fisher_swbd/s5/local/swbd1_data_prep.sh +++ b/egs/fisher_swbd/s5/local/swbd1_data_prep.sh @@ -14,7 +14,7 @@ #check existing directories if [ $# != 1 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD" exit 1; fi @@ -23,7 +23,6 @@ SWBD_DIR=$1 dir=data/local/train_swbd mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" @@ -34,23 +33,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! 
-d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - # To get the SWBD transcriptions and dict, do: - echo " *** Downloading transcriptions and dictionary ***" - ( - cd $dir; - wget http://www.openslr.org/resources/5/switchboard_word_alignments.tar.gz || - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! -f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; diff --git a/egs/swbd/s5c/local/swbd1_data_download.sh b/egs/swbd/s5c/local/swbd1_data_download.sh index dd3559d2b45..d8f076b5141 100755 --- a/egs/swbd/s5c/local/swbd1_data_download.sh +++ b/egs/swbd/s5c/local/swbd1_data_download.sh @@ -10,18 +10,11 @@ ## you unpacked this. We are just doing a "find" command to locate ## the .sph files. -## The second input is optional, which should point to a directory containing -## Switchboard transcriptions/documentations (specifically, the conv.tab file). -## If specified, the script will try to use the actual speaker PINs provided -## with the corpus instead of the conversation side ID (Kaldi default). We -## will be using "find" to locate this file so we don't make any assumptions -## on the directory structure. (Peng Qi, Aug 2014) - . path.sh #check existing directories -if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" +if [ $# != 1 ]; then + echo "Usage: swbd1_data_download.sh /path/to/SWBD" exit 1; fi @@ -30,18 +23,12 @@ SWBD_DIR=$1 dir=data/local/train mkdir -p $dir - # Audio data directory check if [ ! -d $SWBD_DIR ]; then echo "Error: run.sh requires a directory argument" exit 1; fi -sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe -[ ! -x $sph2pipe ] \ - && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - - # Trans directory check if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then ( diff --git a/egs/swbd/s5c/local/swbd1_data_prep.sh b/egs/swbd/s5c/local/swbd1_data_prep.sh index 57fb0ff56c8..9621e7fc06e 100755 --- a/egs/swbd/s5c/local/swbd1_data_prep.sh +++ b/egs/swbd/s5c/local/swbd1_data_prep.sh @@ -21,7 +21,7 @@ #check existing directories if [ $# != 1 -a $# != 2 ]; then - echo "Usage: swbd1_data_prep_edin.sh /path/to/SWBD [/path/to/SWBD_DOC]" + echo "Usage: swbd1_data_prep.sh /path/to/SWBD [/path/to/SWBD_DOC]" exit 1; fi @@ -41,23 +41,6 @@ sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe [ ! -x $sph2pipe ] \ && echo "Could not execute the sph2pipe program at $sph2pipe" && exit 1; - -# Trans directory check -if [ ! -d $SWBD_DIR/transcriptions/swb_ms98_transcriptions ]; then - ( - cd $dir; - if [ ! -d swb_ms98_transcriptions ]; then - echo " *** Downloading trascriptions and dictionary ***" - wget http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz - tar -xf switchboard_word_alignments.tar.gz - fi - ) -else - echo "Directory with transcriptions exists, skipping downloading" - [ -f $dir/swb_ms98_transcriptions ] \ - || ln -sf $SWBD_DIR/transcriptions/swb_ms98_transcriptions $dir/ -fi - # Option A: SWBD dictionary file check [ ! 
-f $dir/swb_ms98_transcriptions/sw-ms98-dict.text ] && \ echo "SWBD dictionary file does not exist" && exit 1; From 92994e2c9a6b60f3390ab19081b1759aa2caaa74 Mon Sep 17 00:00:00 2001 From: Gaurav Kumar Date: Wed, 17 Feb 2016 03:44:52 -0500 Subject: [PATCH 14/32] Handle multiple pronunciations in lexicon. Fixes #506 --- egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh index dae46cfddf5..6d04f53c7e5 100755 --- a/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh +++ b/egs/fisher_callhome_spanish/s5/local/fsp_prepare_dict.sh @@ -70,6 +70,7 @@ if [ $stage -le 2 ]; then # representation cat $tmpdir/uniquewords | $local/spron.pl $lexicon/callhome_spanish_lexicon_970908/preferences $lexicon/callhome_spanish_lexicon_970908/basic_rules \ | cut -f1 | sed -r 's:#\S+\s\S+\s\S+\s\S+\s(\S+):\1:g' \ + | awk -F '[/][/]' '{print $1}' \ > $tmpdir/lexicon_raw fi From a059643d198777975a58cce5816fd5e1d642963a Mon Sep 17 00:00:00 2001 From: vesis84 Date: Wed, 17 Feb 2016 11:49:47 +0100 Subject: [PATCH 15/32] updating 'cmd.sh' for BUT cluster in various recipes, --- egs/ami/s5/cmd.sh | 6 +++--- egs/rm/s5/cmd.sh | 2 +- egs/swbd/s5c/cmd.sh | 2 +- egs/tedlium/s5/cmd.sh | 2 +- egs/timit/s5/cmd.sh | 16 ++++++++-------- egs/wsj/s5/cmd.sh | 6 +++--- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh index 9bc2b3195ef..c3ac80d6846 100644 --- a/egs/ami/s5/cmd.sh +++ b/egs/ami/s5/cmd.sh @@ -28,10 +28,10 @@ export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2" if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi diff --git a/egs/rm/s5/cmd.sh b/egs/rm/s5/cmd.sh index 4478796305e..4d009813fd2 100644 --- a/egs/rm/s5/cmd.sh +++ b/egs/rm/s5/cmd.sh @@ -22,7 +22,7 @@ cuda_cmd="queue.pl -l arch=*64 -l gpu=1" # BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" diff --git a/egs/swbd/s5c/cmd.sh b/egs/swbd/s5c/cmd.sh index 3f7de21e279..3dfaceaafab 100644 --- a/egs/swbd/s5c/cmd.sh +++ b/egs/swbd/s5c/cmd.sh @@ -15,7 +15,7 @@ export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.25" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" diff --git a/egs/tedlium/s5/cmd.sh 
b/egs/tedlium/s5/cmd.sh index bed97d34020..ba7f120e599 100644 --- a/egs/tedlium/s5/cmd.sh +++ b/egs/tedlium/s5/cmd.sh @@ -19,7 +19,7 @@ host=$(hostname -f) if [ ${host#*.} == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" diff --git a/egs/timit/s5/cmd.sh b/egs/timit/s5/cmd.sh index fd91a53ff73..0150f486298 100644 --- a/egs/timit/s5/cmd.sh +++ b/egs/timit/s5/cmd.sh @@ -12,18 +12,18 @@ #export cuda_cmd=run.pl -if [[ $(hostname -f) == *.clsp.jhu.edu ]]; then +if [ "$(hostname -d)" == "clsp.jhu.edu" ]; then export train_cmd="queue.pl -l arch=*64*" export decode_cmd="queue.pl -l arch=*64* --mem 3G" - export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" export cuda_cmd="queue.pl -l gpu=1" -elif [[ $(hostname -f) == *.fit.vutbr.cz ]]; then +elif [ "$(hostname -d)" == "fit.vutbr.cz" ]; then #b) BUT cluster options - queue="all.q@@blade,all.q@@speech,all.q@dellgpu*,all.q@supergpu*" - export train_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,matylda5=0.5" - export decode_cmd="queue.pl -q $queue -l ram_free=3000M,mem_free=3000M,matylda5=0.1" - export mkgraph_cmd="queue.pl -q $queue -l ram_free=4G,mem_free=4G,matylda5=3" - export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu1,long.q@pcgpu*,long.q@supergpu1 -l gpu=1" + queue="all.q@@blade,all.q@@speech" + gpu_queue="long.q@@gpu" + storage="matylda5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.5" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" else echo "$0: you need to define options for your cluster." 
exit 1; diff --git a/egs/wsj/s5/cmd.sh b/egs/wsj/s5/cmd.sh index e5e8f9d26d4..96c48af42c1 100644 --- a/egs/wsj/s5/cmd.sh +++ b/egs/wsj/s5/cmd.sh @@ -21,9 +21,9 @@ export cuda_cmd="queue.pl -l gpu=1" #c) BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" - gpu_queue="long.q@supergpu*,long.q@dellgpu*,long.q@pcspeech-gpu,long.q@pcgpu*" + gpu_queue="long.q@@gpu" storage="matylda5" - export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" - export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" + export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" + export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi From 04cd90211f8dffe4fccbf63de92dfddb86e423a3 Mon Sep 17 00:00:00 2001 From: vesis84 Date: Wed, 17 Feb 2016 12:22:42 +0100 Subject: [PATCH 16/32] fixing tidigits data preparation, --- egs/tidigits/s5/local/tidigits_prepare_lang.sh | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/egs/tidigits/s5/local/tidigits_prepare_lang.sh b/egs/tidigits/s5/local/tidigits_prepare_lang.sh index ff316514fc9..0bc08ab40a0 100755 --- a/egs/tidigits/s5/local/tidigits_prepare_lang.sh +++ b/egs/tidigits/s5/local/tidigits_prepare_lang.sh @@ -88,10 +88,11 @@ utils/make_lexicon_fst.pl $tmpdir/lexicon.txt 0.5 sil | \ cp $lang/L.fst $lang/L_disambig.fst -silphonelist=`cat $lang/phones/silence.csl | sed 's/:/ /g'` -nonsilphonelist=`cat $lang/phones/nonsilence.csl | sed 's/:/ /g'` -cat conf/topo.proto | sed "s:NONSILENCEPHONES:$nonsilphonelist:" | \ - sed "s:SILENCEPHONES:$silphonelist:" > $lang/topo +num_sil_states=5 +num_nonsil_states=3 +silphonelist=`cat $lang/phones/silence.csl` +nonsilphonelist=`cat $lang/phones/nonsilence.csl` +utils/gen_topo.pl $num_nonsil_states $num_sil_states $nonsilphonelist $silphonelist >$lang/topo # Now we prepare a simple grammar G.fst that's a kind of loop of # digits (no silence in this, since that's handled in L.fst) From 2646cfb6fda83a358f27d06b0ec4b2cee95ef264 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 17 Feb 2016 18:11:25 -0500 Subject: [PATCH 17/32] chain+swbd experiments: tuning-experiment results --- egs/swbd/s5c/local/chain/run_tdnn_6a.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh index 12589033819..c618d1c0adf 100755 --- a/egs/swbd/s5c/local/chain/run_tdnn_6a.sh +++ b/egs/swbd/s5c/local/chain/run_tdnn_6a.sh @@ -4,6 +4,16 @@ # but take the final-hidden-dim back up to 500, which is the same as what # it was in 5v. +# No better. +#local/chain/compare_wer.sh 5v 6a +#System 5v 6a +#WER on train_dev(tg) 15.38 15.49 +#WER on train_dev(fg) 14.39 14.30 +#WER on eval2000(tg) 17.4 17.5 +#WER on eval2000(fg) 15.7 15.9 +#Final train prob -0.11156 -0.109471 +#Final valid prob -0.131797 -0.129035 + # _5y is as _5v, but rebalancing the network to have fewer parameters in the # final layer and more in the hidden parts, by reducing --final-hidden-dim from 500 # (it defaults to --jesus-forward-hidden-dim) to 400, and increasing From 187fa16fcfb4c1b55405717506964fe0d2245eb8 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Wed, 17 Feb 2016 18:38:42 -0500 Subject: [PATCH 18/32] chain branch: changing the self-repair code so that it should work well for sigmoid and tanh, although this is not tested yet. some other bug-fixes. 
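
For readers skimming the diff below: the numpy sketch that follows mirrors what the new per-component RepairGradients() methods do, based on the comments added in nnet-simple-component.cc.  It is not Kaldi code; the defaults shown (scale 1e-5 as used in the recipes, derivative thresholds 0.05 for sigmoid, 0.2 for tanh, 0.05/0.95 for ReLU) are taken from the patch, while the random per-minibatch gating is omitted apart from the compensating 1/repair_probability factor.  Array shapes are [num-frames x dim], matching the CuMatrix code.

import numpy as np

def repair_sigmoid(in_deriv, out_value, avg_deriv, scale=1.0e-05,
                   lower_threshold=0.05, repair_probability=0.5):
    # Dimensions whose average derivative (deriv_sum_ / count_) is below the
    # threshold are "problematic"; push their pre-nonlinearity inputs back
    # towards zero by adding -scale * (2*y - 1), where y is the sigmoid output.
    bad = (avg_deriv < lower_threshold).astype(in_deriv.dtype)     # [dim]
    in_deriv -= (2.0 * scale / repair_probability) * out_value * bad
    in_deriv += (scale / repair_probability) * bad
    return in_deriv

def repair_tanh(in_deriv, out_value, avg_deriv, scale=1.0e-05,
                lower_threshold=0.2, repair_probability=0.5):
    # Same idea; the tanh output already lives in (-1, 1), so the added term
    # is simply -scale * y for the problematic dimensions.
    bad = (avg_deriv < lower_threshold).astype(in_deriv.dtype)
    in_deriv -= (scale / repair_probability) * out_value * bad
    return in_deriv

def repair_relu(in_deriv, avg_active, scale=1.0e-05,
                lower_threshold=0.05, upper_threshold=0.95,
                repair_probability=0.5):
    # avg_active is the fraction of frames on which each unit fired
    # (deriv_sum_ / count_, since the ReLU derivative is 0 or 1).  Units that
    # are almost never active get a small positive constant added to their
    # derivative; units that are almost always active get a small negative one.
    correction = ((avg_active < lower_threshold).astype(in_deriv.dtype)
                  - (avg_active > upper_threshold).astype(in_deriv.dtype))
    in_deriv += (scale / repair_probability) * correction
    return in_deriv

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    deriv = rng.randn(4, 3).astype(np.float32)
    y = 1.0 / (1.0 + np.exp(-rng.randn(4, 3).astype(np.float32)))
    avg_deriv = np.array([0.01, 0.20, 0.24], dtype=np.float32)  # first dim saturated
    print(repair_sigmoid(deriv.copy(), y, avg_deriv))
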
--- src/nnet3/nnet-component-itf.cc | 57 +------- src/nnet3/nnet-component-itf.h | 7 - src/nnet3/nnet-compute-test.cc | 23 ++-- src/nnet3/nnet-general-component.h | 20 +-- src/nnet3/nnet-simple-component.cc | 206 +++++++++++++++++++++++++++-- src/nnet3/nnet-simple-component.h | 14 ++ src/nnet3/nnet-test-utils.cc | 4 +- 7 files changed, 234 insertions(+), 97 deletions(-) diff --git a/src/nnet3/nnet-component-itf.cc b/src/nnet3/nnet-component-itf.cc index bbab3d3ba81..cdb43473090 100644 --- a/src/nnet3/nnet-component-itf.cc +++ b/src/nnet3/nnet-component-itf.cc @@ -55,7 +55,7 @@ ComponentPrecomputedIndexes* ComponentPrecomputedIndexes::NewComponentPrecompute ans = new StatisticsExtractionComponentPrecomputedIndexes(); } else if (cpi_type == "StatisticsPoolingComponentPrecomputedIndexes") { ans = new StatisticsPoolingComponentPrecomputedIndexes(); - } + } if (ans != NULL) { KALDI_ASSERT(cpi_type == ans->Type()); } @@ -428,61 +428,6 @@ void NonlinearComponent::InitFromConfig(ConfigLine *cfl) { } -void NonlinearComponent::RepairGradients( - bool measure_deriv, - BaseFloat default_lower_threshold, - BaseFloat default_upper_threshold, - CuMatrixBase *in_deriv) const { - const CuVector &stats_src = (measure_deriv ? deriv_sum_ : value_sum_); - if (self_repair_scale_ == 0.0 || count_ == 0.0 || stats_src.Dim() != dim_) - return; - // we use this 'repair_probability' (hardcoded for now) to limit - // this code to running on about half of the minibatches. - BaseFloat repair_probability = 0.5; - if (RandUniform() > repair_probability) - return; - - // check that the self-repair scale is in a reasonable range. - KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); - BaseFloat unset = kUnsetThreshold; // -1000.0 - BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? - default_lower_threshold : - self_repair_lower_threshold_) * - count_, - upper_threshold = (self_repair_upper_threshold_ == unset ? - default_upper_threshold : - self_repair_upper_threshold_) * - count_; - - CuMatrix storage(2, dim_ + 2, kUndefined); - CuSubVector thresholds_vec(storage.RowData(0) + dim_, 2); - CuSubMatrix stats_mat(storage, 0, 2, 0, dim_); - thresholds_vec(0) = -lower_threshold; - thresholds_vec(1) = -upper_threshold; - CuSubVector row0(stats_mat, 0); - CuSubVector row1(stats_mat, 1); - - row0.CopyFromVec(stats_src); - row1.CopyFromVec(row0); - stats_mat.AddVecToCols(1.0, thresholds_vec, 1.0); - // now row0 equals stats - lower_threshold, and - // row1 equals stats - upper_threshold. - stats_mat.ApplyHeaviside(); - // now row0 equals (stats > lower_threshold ? 1 : 0), and - // row1 equals (stats > upper_threshold ? 1 : 0). - // what we want is: - // self_repair_scale * ((stats <= lower_threshold ? 1 : 0) + - // (stats > upper_threshold ? -1 : 0)). - // - // we can get these in stats_mat.Row(0) by computing: - // -self_repair_scale * (stats_mat.Row(1) + stats_mat.Row(0) - 1). - row0.AddVec(1.0, row1, 1.0); - row0.Add(-1.0); - // [actually we need to divide by repair_probability also, to - // correct for the fact that we only do this on some frames.] 
- row0.Scale(-self_repair_scale_ / repair_probability); - in_deriv->AddVecToRows(1.0, row0, 1.0); -} } // namespace nnet3 } // namespace kaldi diff --git a/src/nnet3/nnet-component-itf.h b/src/nnet3/nnet-component-itf.h index 6ee702b7797..be78014c20b 100644 --- a/src/nnet3/nnet-component-itf.h +++ b/src/nnet3/nnet-component-itf.h @@ -494,13 +494,6 @@ class NonlinearComponent: public Component { protected: enum { kUnsetThreshold = -1000 }; - // this function is to be called from Backprop code if it makes - // sense for the nonlinearity typte - void RepairGradients(bool measure_deriv, - BaseFloat default_lower_threshold, - BaseFloat default_upper_threshold, - CuMatrixBase *in_deriv) const; - friend class SigmoidComponent; friend class TanhComponent; friend class SoftmaxComponent; diff --git a/src/nnet3/nnet-compute-test.cc b/src/nnet3/nnet-compute-test.cc index 33e9ede3812..7fdb3dab982 100644 --- a/src/nnet3/nnet-compute-test.cc +++ b/src/nnet3/nnet-compute-test.cc @@ -34,19 +34,16 @@ void UnitTestNnetComputationIo(NnetComputation *computation) { computation->Write(os, binary); const std::string &original_output = os.str(); std::istringstream computation_is(original_output); - KALDI_LOG << computation_is.str(); computation->Read(computation_is, binary); std::istringstream computation_is2(original_output); NnetComputation computation2; computation2.Read(computation_is2, binary); - + std::ostringstream os2, os3; computation->Write(os2, binary); computation2.Write(os3, binary); - + if (binary) { - KALDI_LOG << os2.str(); - KALDI_LOG << original_output; KALDI_ASSERT(os2.str() == original_output); KALDI_ASSERT(os3.str() == original_output); } @@ -62,15 +59,13 @@ void UnitTestComputationRequestIo(ComputationRequest *request) { std::istringstream request_is2(original_output); ComputationRequest request2; request2.Read(request_is2, binary); - + std::ostringstream os2, os3; request->Write(os2, binary); request2.Write(os3, binary); KALDI_ASSERT(*request == request2); if (binary) { - KALDI_LOG << os2.str(); - KALDI_LOG << original_output; KALDI_ASSERT(os2.str() == original_output); KALDI_ASSERT(os3.str() == original_output); } @@ -86,10 +81,10 @@ void TestNnetDecodable(const ComputationRequest &request, } void UnitTestNnetCompute() { - for (int32 n = 0; n < 20; n++) { + for (int32 n = 0; n < 20; n++) { struct NnetGenerationOptions gen_config; - + std::vector configs; GenerateConfigSequence(gen_config, &configs); Nnet nnet; @@ -102,7 +97,7 @@ void UnitTestNnetCompute() { ComputationRequest request; std::vector > inputs; ComputeExampleComputationRequestSimple(nnet, &request, &inputs); - + NnetComputation computation; Compiler compiler(request, nnet); @@ -117,7 +112,7 @@ void UnitTestNnetCompute() { } CheckComputationOptions check_config; // we can do the rewrite check since it's before optimization. 
- check_config.check_rewrite = true; + check_config.check_rewrite = true; ComputationChecker checker(check_config, nnet, computation); checker.Check(); @@ -135,7 +130,7 @@ void UnitTestNnetCompute() { NnetComputeOptions compute_opts; if (RandInt(0, 1) == 0) compute_opts.debug = true; - + computation.ComputeCudaIndexes(); NnetComputer computer(compute_opts, computation, @@ -151,7 +146,7 @@ void UnitTestNnetCompute() { const CuMatrixBase &output(computer.GetOutput("output")); TestNnetDecodable(request, inputs, nnet, output); - + KALDI_LOG << "Output sum is " << output.Sum(); CuMatrix output_deriv(output.NumRows(), output.NumCols()); output_deriv.SetRandn(); diff --git a/src/nnet3/nnet-general-component.h b/src/nnet3/nnet-general-component.h index edf6b993ddc..e7c2ff3a78e 100644 --- a/src/nnet3/nnet-general-component.h +++ b/src/nnet3/nnet-general-component.h @@ -142,11 +142,11 @@ class DistributeComponentPrecomputedIndexes: virtual ComponentPrecomputedIndexes* Copy() const { return new DistributeComponentPrecomputedIndexes(*this); } - + virtual void Write(std::ostream &ostream, bool binary) const; - + virtual void Read(std::istream &istream, bool binary); - + virtual std::string Type() const { return "DistributeComponentPrecomputedIndexes"; } }; @@ -291,10 +291,10 @@ class StatisticsExtractionComponentPrecomputedIndexes: } virtual void Write(std::ostream &os, bool binary) const; - + virtual void Read(std::istream &is, bool binary); - - virtual std::string Type() const { return "StaticticsExtractionComponentPrecomputedIndexes"; } + + virtual std::string Type() const { return "StatisticsExtractionComponentPrecomputedIndexes"; } private: virtual ~StatisticsExtractionComponentPrecomputedIndexes() { } }; @@ -431,12 +431,12 @@ class StatisticsPoolingComponentPrecomputedIndexes: ComponentPrecomputedIndexes *Copy() const { return new StatisticsPoolingComponentPrecomputedIndexes(*this); } - + virtual void Write(std::ostream &os, bool binary) const; - + virtual void Read(std::istream &is, bool binary); - - virtual std::string Type() const { return "StaticticsPoolingComponentPrecomputedIndexes"; } + + virtual std::string Type() const { return "StatisticsPoolingComponentPrecomputedIndexes"; } }; } // namespace nnet3 diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index e97278f86dd..aadd0c05a1d 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -418,13 +418,14 @@ void NormalizeComponent::Backprop(const std::string &debug_info, in_deriv->AddDiagVecMat(1.0, in_norm, out_deriv_no_log, kNoTrans, 1.0); else in_deriv->MulRowsVec(in_norm); + in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0); + in_norm.ApplyPow(3.0); + dot_products.MulElements(in_norm); + + in_deriv->AddDiagVecMat(-1.0 / d_scaled, + dot_products, in_value, + kNoTrans, 1.0); } - in_norm.ReplaceValue(1.0 / sqrt(kSquaredNormFloor), 0.0); - in_norm.ApplyPow(3.0); - dot_products.MulElements(in_norm); - in_deriv->AddDiagVecMat(-1.0 / d_scaled, - dot_products, in_value, - kNoTrans, 1.0); } void SigmoidComponent::Propagate(const ComponentPrecomputedIndexes *indexes, @@ -442,10 +443,79 @@ void SigmoidComponent::Backprop(const std::string &debug_info, CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->DiffSigmoid(out_value, out_deriv); - RepairGradients(false, 0.025, 0.975, in_deriv); + RepairGradients(out_value, in_deriv); } } +void SigmoidComponent::RepairGradients( + const CuMatrixBase &out_value, + CuMatrixBase *in_deriv) const { + // maximum possible 
derivative of SigmoidComponent is 0.25. + // the default lower-threshold on the derivative, below which we + // add a term to the derivative to encourage the inputs to the sigmoid + // to be closer to zero, is 0.05, which means the derivative is on average + // 5 times smaller than its maximum possible value. + BaseFloat default_lower_threshold = 0.05; + + // we use this 'repair_probability' (hardcoded for now) to limit + // this code to running on about half of the minibatches. + BaseFloat repair_probability = 0.5; + + if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ || + RandUniform() > repair_probability) + return; + + // check that the self-repair scale is in a reasonable range. + KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); + BaseFloat unset = kUnsetThreshold; // -1000.0 + BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? + default_lower_threshold : + self_repair_lower_threshold_) * + count_; + if (self_repair_upper_threshold_ != unset) { + KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " + << "components, it does nothing."; + } + + // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors). + CuMatrix thresholds(1, dim_); + CuSubVector thresholds_vec(thresholds, 0); + thresholds_vec.AddVec(-1.0, deriv_sum_); + thresholds_vec.Add(lower_threshold); + thresholds.ApplyHeaviside(); + + // At this point, 'thresholds_vec' contains a 1 for each dimension of + // the output that is 'problematic', i.e. for which the avg-deriv + // is less than the self-repair lower threshold, and a 0 for + // each dimension that is not problematic. + + // what we want to do is to add + // -self_repair_scale_ / repair_probability times (2 * output-valiue - 1.0) + // to the input derivative for each problematic dimension. + + // Here, 2 * output - 1.0 is a version of the sigmoid that goes from -1.0 to + // 1.0, like a tanh. the negative sign is so that for inputs <0, we push them + // up towards 0, and for inputs >0, we push them down towards 0. + // Our use of this sigmoid-type function here is just a convenience since + // we have it available. We could use just about any function that is positive + // for inputs < 0 and negative for inputs > 0. + + // We can rearrange the above as: for only the problematic columns, + // input-deriv -= 2 * self-repair-scale / repair-probabilty * output + // input-deriv += self-repair-scale / repair-probabilty + // which we can write as: + // input-deriv -= 2 * self-repair-scale / repair-probabilty * output * thresholds-vec + // input-deriv += self-repair-scale / repair-probabilty * thresholds-vec + + in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability, + out_value, kNoTrans, thresholds_vec); + in_deriv->AddVecToCols(self_repair_scale_ / repair_probability, + thresholds_vec); +} + + + void SigmoidComponent::StoreStats(const CuMatrixBase &out_value) { // only store stats about every other minibatch. 
if (RandInt(0, 1) == 0) @@ -628,6 +698,68 @@ void TanhComponent::Propagate(const ComponentPrecomputedIndexes *indexes, out->Tanh(in); } + +void TanhComponent::RepairGradients( + const CuMatrixBase &out_value, + CuMatrixBase *in_deriv) const { + // maximum possible derivative of TanhComponent is 1.0. + // the default lower-threshold on the derivative, below which we + // add a term to the derivative to encourage the inputs to the tanh + // to be closer to zero, is 0.2, which means the derivative is on average + // 5 times smaller than its maximum possible value. + BaseFloat default_lower_threshold = 0.2; + + // we use this 'repair_probability' (hardcoded for now) to limit + // this code to running on about half of the minibatches. + BaseFloat repair_probability = 0.5; + + if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ || + RandUniform() > repair_probability) + return; + + // check that the self-repair scale is in a reasonable range. + KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); + BaseFloat unset = kUnsetThreshold; // -1000.0 + BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? + default_lower_threshold : + self_repair_lower_threshold_) * + count_; + if (self_repair_upper_threshold_ != unset) { + KALDI_ERR << "Do not set the self-repair-upper-threshold for sigmoid " + << "components, it does nothing."; + } + + // thresholds_vec is actually a 1-row matrix. (the ApplyHeaviside + // function isn't defined for vectors). + CuMatrix thresholds(1, dim_); + CuSubVector thresholds_vec(thresholds, 0); + thresholds_vec.AddVec(-1.0, deriv_sum_); + thresholds_vec.Add(lower_threshold); + thresholds.ApplyHeaviside(); + + // At this point, 'thresholds_vec' contains a 1 for each dimension of + // the output that is 'problematic', i.e. for which the avg-deriv + // is less than the self-repair lower threshold, and a 0 for + // each dimension that is not problematic. + + // what we want to do is to add -self_repair_scale_ / repair_probability times + // output-value to the input derivative for each problematic dimension. + // note that for the tanh, the output-value goes from -1.0 when the input is + // -inf to +1.0 when the input is +inf. The negative sign is so that for + // inputs <0, we push them up towards 0, and for inputs >0, we push them down + // towards 0. Our use of the tanh here is just a convenience since we have it + // available. We could use just about any function that is positive for + // inputs < 0 and negative for inputs > 0.
+ + // We can rearrange the above as: for only the problematic columns, + // input-deriv -= self-repair-scale / repair-probability * output + // which we can write as: + // input-deriv -= self-repair-scale / repair-probability * output * thresholds-vec + + in_deriv->AddMatDiagVec(-self_repair_scale_ / repair_probability, + out_value, kNoTrans, thresholds_vec); +} + void TanhComponent::Backprop(const std::string &debug_info, const ComponentPrecomputedIndexes *indexes, const CuMatrixBase &, @@ -638,7 +770,7 @@ void TanhComponent::Backprop(const std::string &debug_info, CuMatrixBase *in_deriv) const { if (in_deriv != NULL) { in_deriv->DiffTanh(out_value, out_deriv); - RepairGradients(false, -0.95, 0.95, in_deriv); + RepairGradients(out_value, in_deriv); } } @@ -681,10 +813,66 @@ void RectifiedLinearComponent::Backprop( if (in_deriv != NULL) { in_deriv->Heaviside(out_value); in_deriv->MulElements(out_deriv); - RepairGradients(true, 0.05, 0.95, in_deriv); + RepairGradients(in_deriv); } } + +void RectifiedLinearComponent::RepairGradients( + CuMatrixBase *in_deriv) const { + BaseFloat default_lower_threshold = 0.05, + default_upper_threshold = 0.95; + // we use this 'repair_probability' (hardcoded for now) to limit + // this code to running on about half of the minibatches. + BaseFloat repair_probability = 0.5; + + if (self_repair_scale_ == 0.0 || count_ == 0.0 || deriv_sum_.Dim() != dim_ || + RandUniform() > repair_probability) + return; + + // check that the self-repair scale is in a reasonable range. + KALDI_ASSERT(self_repair_scale_ > 0.0 && self_repair_scale_ < 0.1); + BaseFloat unset = kUnsetThreshold; // -1000.0 + BaseFloat lower_threshold = (self_repair_lower_threshold_ == unset ? + default_lower_threshold : + self_repair_lower_threshold_) * + count_, + upper_threshold = (self_repair_upper_threshold_ == unset ? + default_upper_threshold : + self_repair_upper_threshold_) * + count_; + + CuMatrix storage(2, dim_ + 2, kUndefined); + CuSubVector thresholds_vec(storage.RowData(0) + dim_, 2); + CuSubMatrix stats_mat(storage, 0, 2, 0, dim_); + thresholds_vec(0) = -lower_threshold; + thresholds_vec(1) = -upper_threshold; + CuSubVector row0(stats_mat, 0); + CuSubVector row1(stats_mat, 1); + + row0.CopyFromVec(deriv_sum_); + row1.CopyFromVec(row0); + stats_mat.AddVecToCols(1.0, thresholds_vec, 1.0); + // now row0 equals stats - lower_threshold, and + // row1 equals stats - upper_threshold. + stats_mat.ApplyHeaviside(); + // now row0 equals (stats > lower_threshold ? 1 : 0), and + // row1 equals (stats > upper_threshold ? 1 : 0). + // what we want is: + // self_repair_scale * ((stats <= lower_threshold ? 1 : 0) + + // (stats > upper_threshold ? -1 : 0)). + // + // we can get these in stats_mat.Row(0) by computing: + // -self_repair_scale * (stats_mat.Row(1) + stats_mat.Row(0) - 1). + row0.AddVec(1.0, row1, 1.0); + row0.Add(-1.0); + // [actually we need to divide by repair_probability also, to + // correct for the fact that we only do this on some frames.] + row0.Scale(-self_repair_scale_ / repair_probability); + in_deriv->AddVecToRows(1.0, row0, 1.0); +} + + void RectifiedLinearComponent::StoreStats( const CuMatrixBase &out_value) { // only store stats about every other minibatch.
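A quick summary of what the self-repair code above computes, written as a sketch in terms of the same variable names as the code. For SigmoidComponent, for each 'problematic' dimension d (one whose accumulated derivative deriv_sum_(d) fell below lower_threshold), the AddMatDiagVec plus AddVecToCols calls together add -(self_repair_scale_ / repair_probability) * (2 * out_value(i, d) - 1.0) to in_deriv(i, d) for every row i, nudging that dimension's pre-activation back towards zero. TanhComponent is identical except that the added term is -(self_repair_scale_ / repair_probability) * out_value(i, d), since tanh already ranges over (-1, 1). For RectifiedLinearComponent, writing s = deriv_sum_, l = lower_threshold, u = upper_threshold and H for the Heaviside step computed by ApplyHeaviside, the quantity built up in row0 is -(H(s - l) + H(s - u) - 1) = (1 - H(s - l)) - H(s - u), i.e. +1 for dimensions that are active too rarely, -1 for dimensions that are active too often, and 0 otherwise; scaled by self_repair_scale_ / repair_probability it is then added to every row of in_deriv by AddVecToRows.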
diff --git a/src/nnet3/nnet-simple-component.h b/src/nnet3/nnet-simple-component.h index c3a3048202f..d8295ac10e5 100644 --- a/src/nnet3/nnet-simple-component.h +++ b/src/nnet3/nnet-simple-component.h @@ -190,6 +190,11 @@ class SigmoidComponent: public NonlinearComponent { CuMatrixBase *in_deriv) const; virtual void StoreStats(const CuMatrixBase &out_value); private: + // this function is called from Backprop code and only does something if the + // self-repair-scale config value is set. + void RepairGradients(const CuMatrixBase &out_value, + CuMatrixBase *in_deriv) const; + SigmoidComponent &operator = (const SigmoidComponent &other); // Disallow. }; @@ -214,6 +219,11 @@ class TanhComponent: public NonlinearComponent { CuMatrixBase *in_deriv) const; virtual void StoreStats(const CuMatrixBase &out_value); private: + // this function is called from Backprop code and only does something if the + // self-repair-scale config value is set. + void RepairGradients(const CuMatrixBase &out_value, + CuMatrixBase *in_deriv) const; + TanhComponent &operator = (const TanhComponent &other); // Disallow. }; @@ -242,6 +252,10 @@ class RectifiedLinearComponent: public NonlinearComponent { virtual void StoreStats(const CuMatrixBase &out_value); private: + // this function is called from Backprop code and only does something if the + // self-repair-scale config value is set. + void RepairGradients(CuMatrixBase *in_deriv) const; + RectifiedLinearComponent &operator = (const RectifiedLinearComponent &other); // Disallow. }; diff --git a/src/nnet3/nnet-test-utils.cc b/src/nnet3/nnet-test-utils.cc index 5a02aa7da02..933808dc61c 100644 --- a/src/nnet3/nnet-test-utils.cc +++ b/src/nnet3/nnet-test-utils.cc @@ -942,7 +942,9 @@ static void GenerateRandomComponentConfig(std::string *component_type, BaseFloat target_rms = (RandInt(1, 200) / 100.0); std::string add_log_stddev = (Rand() % 2 == 0 ? "True" : "False"); *component_type = "NormalizeComponent"; - os << "dim=" << RandInt(1, 50) + // avoid dim=1 because the derivatives would be zero, which + // makes them hard to test. 
+ os << "dim=" << RandInt(2, 50) << " target-rms=" << target_rms << " add-log-stddev=" << add_log_stddev; break; From 7bc34fe909a3c324349c6e88006f7fdb45adedcb Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 18 Feb 2016 15:38:58 -0500 Subject: [PATCH 19/32] cosmetic change: fix 'score' to 'cost' --- src/latbin/lattice-best-path.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/latbin/lattice-best-path.cc b/src/latbin/lattice-best-path.cc index dda41cd0604..dc25fb351c6 100644 --- a/src/latbin/lattice-best-path.cc +++ b/src/latbin/lattice-best-path.cc @@ -121,7 +121,7 @@ int main(int argc, char *argv[]) { } BaseFloat tot_weight_float = tot_weight.Value1() + tot_weight.Value2(); - KALDI_LOG << "Overall score per frame is " << (tot_weight_float/n_frame) + KALDI_LOG << "Overall cost per frame is " << (tot_weight_float/n_frame) << " = " << (tot_weight.Value1()/n_frame) << " [graph]" << " + " << (tot_weight.Value2()/n_frame) << " [acoustic]" << " over " << n_frame << " frames."; From b3bbc038681b5bcd6be02481785f929b5b99e11a Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Thu, 18 Feb 2016 16:58:46 -0500 Subject: [PATCH 20/32] modifying cmd.sh in example recipes to encourage the use of new-style queue options and conf/queue.conf --- egs/ami/s5/cmd.sh | 43 ++++++++++---------- egs/ami/s5/run_ihm.sh | 26 ++++++------ egs/aurora4/s5/cmd.sh | 43 ++++++++------------ egs/babel/s5/cmd.sh | 44 +++++++-------------- egs/babel/s5b/cmd.sh | 44 +++++++-------------- egs/babel/s5c/cmd.sh | 44 +++++++-------------- egs/bn_music_speech/v1/cmd.sh | 28 ++++++------- egs/callhome_egyptian/s5/cmd.sh | 33 +++++++--------- egs/chime1/s5/cmd.sh | 57 +++++++++------------------ egs/chime2/s5/cmd.sh | 44 ++++++++------------- egs/csj/s5/cmd.sh | 46 +++++++-------------- egs/farsdat/s5/cmd.sh | 40 +++++++------------ egs/fisher_callhome_spanish/s5/cmd.sh | 33 +++++++--------- egs/fisher_english/s5/cmd.sh | 44 +++++++-------------- egs/fisher_swbd/s5/cmd.sh | 15 ++----- egs/gale_arabic/s5/cmd.sh | 24 ++++++----- egs/gale_mandarin/s5/cmd.sh | 27 ++++++++----- egs/hkust/s5/cmd.sh | 26 ++++++------ egs/librispeech/s5/cmd.sh | 45 +++++++-------------- egs/lre/v1/cmd.sh | 39 ++++++------------ egs/lre07/v1/cmd.sh | 39 ++++++------------ egs/reverb/s5/cmd.sh | 44 +++++++-------------- egs/rm/s5/cmd.sh | 37 ++++++++--------- egs/sprakbanken/s5/cmd.sh | 45 +++++++-------------- egs/sre08/v1/cmd.sh | 39 ++++++------------ egs/sre10/v1/cmd.sh | 39 ++++++------------ egs/swbd/s5/cmd.sh | 40 +++++++------------ egs/swbd/s5b/cmd.sh | 43 +++++++------------- egs/swbd/s5c/cmd.sh | 32 +++++++++------ egs/thchs30/s5/cmd.sh | 3 +- egs/tidigits/s5/cmd.sh | 29 +++++++------- egs/timit/s5/cmd.sh | 47 ++++++++++------------ egs/voxforge/s5/cmd.sh | 29 +++++++------- egs/vystadial_cz/s5/cmd.sh | 38 +++++++++--------- egs/vystadial_en/s5/cmd.sh | 38 +++++++++--------- egs/wsj/s5/cmd.sh | 36 ++++++++--------- egs/wsj/s5/local/run_kl_hmm.sh | 2 + 37 files changed, 539 insertions(+), 786 deletions(-) diff --git a/egs/ami/s5/cmd.sh b/egs/ami/s5/cmd.sh index c3ac80d6846..5ec5d4b715f 100644 --- a/egs/ami/s5/cmd.sh +++ b/egs/ami/s5/cmd.sh @@ -1,9 +1,24 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 1G" +export decode_cmd="queue.pl --mem 2G" +# the use of cuda_cmd is deprecated but it is sometimes still used in nnet1 +# scripts. +export cuda_cmd="queue.pl --gpu 1 --mem 20G" + +# the rest of this file is present for historical reasons. +# In general it's best to rely on conf/queue.conf for cluster-specific +# configuration. # On Eddie use: #export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00" @@ -11,20 +26,6 @@ #export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4" #export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00" -# JSALT2015 workshop, cluster AWS-EC2, (setup from Vijay) -export train_cmd="queue.pl -l arch=*64* --mem 1G" -export decode_cmd="queue.pl -l arch=*64* --mem 2G" -export highmem_cmd="queue.pl -l arch=*64* --mem 4G" -export scoring_cmd="queue.pl -l arch=*64*" -export cuda_cmd="queue.pl --gpu 1 -l mem_free=20G,ram_free=20G" -export cntk_decode_cmd="queue.pl -l arch=*64* --mem 1G -pe smp 2" - -# To run locally, use: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export highmem_cmd=run.pl -#export cuda_cmd=run.pl - if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" @@ -33,5 +34,5 @@ if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.5" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -fi +fi diff --git a/egs/ami/s5/run_ihm.sh b/egs/ami/s5/run_ihm.sh index b4d41d7066a..b9d60d78182 100755 --- a/egs/ami/s5/run_ihm.sh +++ b/egs/ami/s5/run_ihm.sh @@ -10,13 +10,13 @@ mic=ihm stage=0 . 
utils/parse_options.sh -# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : +# Set bash to 'debug' mode, it prints the commands (option '-x') and exits on : # -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', set -euxo pipefail # Path where AMI gets downloaded (or where locally available): -AMI_DIR=$PWD/wav_db # Default, -case $(hostname -d) in +AMI_DIR=$PWD/wav_db # Default, +case $(hostname -d) in fit.vutbr.cz) AMI_DIR=/mnt/scratch05/iveselyk/KALDI_AMI_WAV ;; # BUT, clsp.jhu.edu) AMI_DIR=/export/corpora4/ami/amicorpus ;; # JHU, cstr.ed.ac.uk) AMI_DIR= ;; # Edinburgh, @@ -86,7 +86,7 @@ if [ $stage -le 5 ]; then data/$mic/train data/lang exp/$mic/tri2a exp/$mic/tri2_ali # Decode, graph_dir=exp/$mic/tri2a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri2a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/dev exp/$mic/tri2a/decode_dev_${LM} @@ -104,26 +104,26 @@ if [ $stage -le 6 ]; then data/$mic/train data/lang exp/$mic/tri3a exp/$mic/tri3a_ali # Decode, graph_dir=exp/$mic/tri3a/graph_${LM} - $highmem_cmd $graph_dir/mkgraph.log \ + $cmd --mem 4G $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri3a $graph_dir steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri3a/decode_dev_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ $graph_dir data/$mic/eval exp/$mic/tri3a/decode_eval_${LM} -fi +fi if [ $stage -le 7 ]; then # Train tri4a, which is LDA+MLLT+SAT, steps/train_sat.sh --cmd "$train_cmd" \ 5000 80000 data/$mic/train data/lang exp/$mic/tri3a_ali exp/$mic/tri4a - # Decode, + # Decode, graph_dir=exp/$mic/tri4a/graph_${LM} $highmem_cmd $graph_dir/mkgraph.log \ utils/mkgraph.sh data/lang_${LM} exp/$mic/tri4a $graph_dir steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} + $graph_dir data/$mic/dev exp/$mic/tri4a/decode_dev_${LM} steps/decode_fmllr.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ - $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} + $graph_dir data/$mic/eval exp/$mic/tri4a/decode_eval_${LM} fi nj_mmi=80 @@ -160,11 +160,11 @@ if [ $stage -le 11 ]; then decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_dev_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_dev_${LM} --iter $i \ - $graph_dir data/$mic/dev $decode_dir + $graph_dir data/$mic/dev $decode_dir decode_dir=exp/$mic/tri4a_mmi_b0.1/decode_eval_${i}.mdl_${LM} steps/decode.sh --nj $nj --cmd "$decode_cmd" --config conf/decode.conf \ --transform-dir exp/$mic/tri4a/decode_eval_${LM} --iter $i \ - $graph_dir data/$mic/eval $decode_dir + $graph_dir data/$mic/eval $decode_dir done fi @@ -181,7 +181,7 @@ if [ $stage -le 13 ]; then --hidden-dim 950 \ --splice-indexes "layer0/-2:-1:0:1:2 layer1/-1:2 layer2/-3:3 layer3/-7:2 layer4/-3:3" \ --use-sat-alignments true - + local/online/run_nnet2_ms_sp_disc.sh \ --mic $mic \ --gmm-dir exp/$mic/tri4a \ diff --git a/egs/aurora4/s5/cmd.sh b/egs/aurora4/s5/cmd.sh index 139b2cd6c6c..378febca15b 100644 --- a/egs/aurora4/s5/cmd.sh +++ b/egs/aurora4/s5/cmd.sh @@ -1,29 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. 
If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still used in some example scripts +# here. export cuda_cmd="queue.pl --gpu 1" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/babel/s5/cmd.sh b/egs/babel/s5/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5/cmd.sh +++ b/egs/babel/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. 
+# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5b/cmd.sh b/egs/babel/s5b/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/babel/s5b/cmd.sh +++ b/egs/babel/s5b/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/babel/s5c/cmd.sh b/egs/babel/s5c/cmd.sh index a4a11bef039..71dd849a93b 100644 --- a/egs/babel/s5c/cmd.sh +++ b/egs/babel/s5c/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/bn_music_speech/v1/cmd.sh b/egs/bn_music_speech/v1/cmd.sh index 27d1d36a6a6..d1ca1a6d126 100755 --- a/egs/bn_music_speech/v1/cmd.sh +++ b/egs/bn_music_speech/v1/cmd.sh @@ -1,17 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" - -#c) run it locally... -#export train_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/callhome_egyptian/s5/cmd.sh b/egs/callhome_egyptian/s5/cmd.sh index ab29f13d4cc..71dd849a93b 100755 --- a/egs/callhome_egyptian/s5/cmd.sh +++ b/egs/callhome_egyptian/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. 
The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/chime1/s5/cmd.sh b/egs/chime1/s5/cmd.sh index dda6226f419..0dcd5a9200f 100755 --- a/egs/chime1/s5/cmd.sh +++ b/egs/chime1/s5/cmd.sh @@ -1,39 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - - -#c) USFD cluster options -#config="conf/queue_usfd.conf" -#export train_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export decode_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export mkgraph_cmd="queue.pl --config $config --mem 8G --rmem 4G" -#export cuda_cmd="queue.pl --config $config --mem 24G --rmem 20G --gpu 1 --time 24:00:00" - - -#d) run it locally... 
-export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" + +# the use of cuda_cmd is deprecated, but it's still used in this recipe. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/chime2/s5/cmd.sh b/egs/chime2/s5/cmd.sh index 8bb00fe0ec6..0dcd5a9200f 100644 --- a/egs/chime2/s5/cmd.sh +++ b/egs/chime2/s5/cmd.sh @@ -1,30 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export big_memory_cmd="queue.pl -l arch=*64,ram_free=8G,mem_free=8G" -export cuda_cmd="queue.pl -l gpu=1" -#export cuda_cmd="..." +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# the use of cuda_cmd is deprecated, but it's still used in this recipe. 
+export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/csj/s5/cmd.sh b/egs/csj/s5/cmd.sh index d5952fe0f87..71dd849a93b 100644 --- a/egs/csj/s5/cmd.sh +++ b/egs/csj/s5/cmd.sh @@ -1,31 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64*" -#export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export train_cmd="run.pl" -export decode_cmd="run.pl" -#export cuda_cmd="..." -#export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -export mkgraph_cmd="run.pl" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/farsdat/s5/cmd.sh b/egs/farsdat/s5/cmd.sh index d749f2c9f1f..71dd849a93b 100644 --- a/egs/farsdat/s5/cmd.sh +++ b/egs/farsdat/s5/cmd.sh @@ -1,25 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -export cuda_cmd="run.pl" - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=2500M,mem_free=2500M,matylda5=0.5" -#export decode_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=3000M,mem_free=3000M,matylda5=0.1" -#export mkgraph_cmd="queue.pl -q all.q@blade[01][0126789][123456789] -l ram_free=4G,mem_free=4G,matylda5=3" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu,long.q@dellgpu*,long.q@pco203-0[0124] -l gpu=1" - -#c) run locally... 
-#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_callhome_spanish/s5/cmd.sh b/egs/fisher_callhome_spanish/s5/cmd.sh index ab29f13d4cc..88db78823a5 100755 --- a/egs/fisher_callhome_spanish/s5/cmd.sh +++ b/egs/fisher_callhome_spanish/s5/cmd.sh @@ -1,18 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#train_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -#decode_cmd='queue.pl -q all.q@a03.clsp.jhu.edu,all.q@a06.clsp.jhu.edu,all.q@a05.clsp.jhu.edu,all.q@v01.clsp.jhu.edu,all.q@a10.clsp.jhu.edu,all.q@a04.clsp.jhu.edu,all.q@a13.clsp.jhu.edu,all.q@a11.clsp.jhu.edu -l arch=*64' -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" -#train_cmd="run.pl" -# Do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_english/s5/cmd.sh b/egs/fisher_english/s5/cmd.sh index a4a11bef039..88db78823a5 100644 --- a/egs/fisher_english/s5/cmd.sh +++ b/egs/fisher_english/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/fisher_swbd/s5/cmd.sh b/egs/fisher_swbd/s5/cmd.sh index ca31c61d256..88db78823a5 100644 --- a/egs/fisher_swbd/s5/cmd.sh +++ b/egs/fisher_swbd/s5/cmd.sh @@ -1,19 +1,12 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - # you can change cmd.sh depending on what type of queue you are using. # If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run +# can change all instances 'queue.pl' to run.pl (but be careful and run # commands one by one: most recipes will exhaust the memory on your -# machine). queue.pl works with GridEngine (qsub). slurm.pl works -# with slurm. Different queues are configured differently, with different +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different # queue names and different ways of specifying things like memory; # to account for these differences you can create and edit the file -# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf to match your queue's configuration. Search for # conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, # or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
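For reference, the conf/queue.conf file that these comments refer to is a small plain-text file read by utils/queue.pl. A minimal sketch, adapted from the 'default_config' string inside utils/queue.pl (the exact qsub options depend on your grid, so treat this as an illustration rather than something to copy verbatim):

  command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
  option mem=* -l mem_free=$0,ram_free=$0
  option mem=0          # do not add anything to the qsub options
  option num_threads=* -pe smp $0
  option num_threads=1  # do not add anything to the qsub options
  default gpu=0
  option gpu=0
  option gpu=* -l gpu=$0 -q g.q

With a file like this in place, generic options such as "queue.pl --mem 4G" or "queue.pl --gpu 1" are translated into whatever resource strings your particular queue expects, which is why the cmd.sh files in this patch no longer hard-code qsub options like -l arch=*64* or ram_free/mem_free.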
diff --git a/egs/gale_arabic/s5/cmd.sh b/egs/gale_arabic/s5/cmd.sh index 6e2777b595b..71dd849a93b 100755 --- a/egs/gale_arabic/s5/cmd.sh +++ b/egs/gale_arabic/s5/cmd.sh @@ -1,11 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/gale_mandarin/s5/cmd.sh b/egs/gale_mandarin/s5/cmd.sh index 6e2777b595b..2d51ad82004 100755 --- a/egs/gale_mandarin/s5/cmd.sh +++ b/egs/gale_mandarin/s5/cmd.sh @@ -1,11 +1,18 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -l 'arch=*64*'" -export decode_cmd="queue.pl -l 'arch=*64*'" -export cuda_cmd="queue.pl -l gpu=1" +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated, but it's still used in this example +# directory. +export cuda_cmd="queue.pl --gpu 1" diff --git a/egs/hkust/s5/cmd.sh b/egs/hkust/s5/cmd.sh index 2a46d89f385..71dd849a93b 100644 --- a/egs/hkust/s5/cmd.sh +++ b/egs/hkust/s5/cmd.sh @@ -1,13 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export train_cmd=run.pl -#export decode_cmd=run.pl - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/librispeech/s5/cmd.sh b/egs/librispeech/s5/cmd.sh index 6395d96ca36..71dd849a93b 100644 --- a/egs/librispeech/s5/cmd.sh +++ b/egs/librispeech/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. 
+ +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/lre/v1/cmd.sh b/egs/lre/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre/v1/cmd.sh +++ b/egs/lre/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/lre07/v1/cmd.sh b/egs/lre07/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/lre07/v1/cmd.sh +++ b/egs/lre07/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... 
-#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/reverb/s5/cmd.sh b/egs/reverb/s5/cmd.sh index e88b07e1195..71dd849a93b 100644 --- a/egs/reverb/s5/cmd.sh +++ b/egs/reverb/s5/cmd.sh @@ -1,29 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64,gpu=1 -q g.q" -export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" - -#export cuda_cmd="..." - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/rm/s5/cmd.sh b/egs/rm/s5/cmd.sh index 4d009813fd2..6e2f3e9ee48 100644 --- a/egs/rm/s5/cmd.sh +++ b/egs/rm/s5/cmd.sh @@ -1,23 +1,24 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -train_cmd="queue.pl -l arch=*64" -decode_cmd="queue.pl -l arch=*64" +export train_cmd=queue.pl +export decode_cmd=queue.pl +export mkgraph_cmd=queue.pl +export cuda_cmd="queue.pl --gpu 1" -# cuda_cmd is used for nnet1 scripts e.g. local/run_dnn.sh, but -# in the nnet2 scripts e.g. local/run_nnet2.sh, this is not -# used and we append options to train_cmd. -cuda_cmd="queue.pl -l arch=*64 -l gpu=1" - -#train_cmd="run.pl" -# with run.pl we do training locally. Note: for jobs on smallish subsets, -# it's way faster to run on a single machine with a handful of CPUs, as -# you avoid the latency of starting GridEngine jobs. +# The rest of this file is here for historical reasons. For cluster-specific +# configuration it's generally better to use conf/queue.conf, see +# http://kaldi-asr.org/doc/queue.html. # BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then @@ -26,5 +27,5 @@ if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1500M,mem_free=1500M,${storage}=1" export decode_cmd="queue.pl -q $queue -l ram_free=2500M,mem_free=2500M,${storage}=0.5" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi diff --git a/egs/sprakbanken/s5/cmd.sh b/egs/sprakbanken/s5/cmd.sh index 43867ccf0d9..71dd849a93b 100644 --- a/egs/sprakbanken/s5/cmd.sh +++ b/egs/sprakbanken/s5/cmd.sh @@ -1,30 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64 --mem 2G" -#export mkgraph_cmd="queue.pl -l arch=*64 --mem 2G" -#export big_memory_cmd="queue.pl -l arch=*64 --mem 2G" -#export cuda_cmd="queue.pl -l gpu=1" - - - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" - -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... 
-export train_cmd=run.pl -export decode_cmd=run.pl -export cuda_cmd=run.pl -export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/sre08/v1/cmd.sh b/egs/sre08/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100644 --- a/egs/sre08/v1/cmd.sh +++ b/egs/sre08/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/sre10/v1/cmd.sh b/egs/sre10/v1/cmd.sh index 5c38b3a5d77..d1ca1a6d126 100755 --- a/egs/sre10/v1/cmd.sh +++ b/egs/sre10/v1/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. 
-# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 4G" diff --git a/egs/swbd/s5/cmd.sh b/egs/swbd/s5/cmd.sh index 4abf8546b0d..bae7f5cdf45 100644 --- a/egs/swbd/s5/cmd.sh +++ b/egs/swbd/s5/cmd.sh @@ -1,28 +1,16 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5b/cmd.sh b/egs/swbd/s5b/cmd.sh index 4abf8546b0d..575407ac0ff 100644 --- a/egs/swbd/s5b/cmd.sh +++ b/egs/swbd/s5b/cmd.sh @@ -1,28 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" -#export cuda_cmd="..." -export mkgraph_cmd="queue.pl -l arch=*64* -l ram_free=4G,mem_free=4G" - -#b) BUT cluster options -#export train_cmd="queue.pl -q all.q@@blade -l ram_free=1200M,mem_free=1200M" -#export decode_cmd="queue.pl -q all.q@@blade -l ram_free=1700M,mem_free=1700M" -#export decodebig_cmd="queue.pl -q all.q@@blade -l ram_free=4G,mem_free=4G" -#export cuda_cmd="queue.pl -q long.q@@pco203 -l gpu=1" -#export cuda_cmd="queue.pl -q long.q@pcspeech-gpu" -#export mkgraph_cmd="queue.pl -q all.q@@servers -l ram_free=4G,mem_free=4G" - -#c) run it locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/swbd/s5c/cmd.sh b/egs/swbd/s5c/cmd.sh index 3dfaceaafab..c5a71711617 100644 --- a/egs/swbd/s5c/cmd.sh +++ b/egs/swbd/s5c/cmd.sh @@ -1,17 +1,23 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. 
Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -# Default opts, -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64* --mem 4G" -export cuda_cmd=run.pl # Run on local machine, -export mkgraph_cmd="queue.pl -l arch=*64* --mem 4G" +export train_cmd="queue.pl" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +export cuda_cmd="queue.pl --gpu 1" -# BUT options, + +# the rest of this file is present for historical reasons. it's better to +# create and edit conf/queue.conf for cluster-specific configuration. if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then # BUT cluster: queue="all.q@@blade,all.q@@speech" @@ -20,5 +26,5 @@ if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.25" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -fi +fi diff --git a/egs/thchs30/s5/cmd.sh b/egs/thchs30/s5/cmd.sh index 6d9fe9c0fb2..1d8e768790f 100644 --- a/egs/thchs30/s5/cmd.sh +++ b/egs/thchs30/s5/cmd.sh @@ -1,6 +1,6 @@ # you can change cmd.sh depending on what type of queue you are using. # If you have no queueing system and want to run on a local machine, you -# can change all instances 'queue.pl' to run.pl (but be careful and run +# can change all instances 'queue.pl' to run.pl (but be careful and run # commands one by one: most recipes will exhaust the memory on your # machine). queue.pl works with GridEngine (qsub). slurm.pl works # with slurm. Different queues are configured differently, with different @@ -13,4 +13,3 @@ export train_cmd=queue.pl export decode_cmd="queue.pl --mem 4G" export mkgraph_cmd="queue.pl --mem 8G" -export cuda_cmd="$train_cmd --gpu 1" diff --git a/egs/tidigits/s5/cmd.sh b/egs/tidigits/s5/cmd.sh index c8f0d9d67a7..71dd849a93b 100644 --- a/egs/tidigits/s5/cmd.sh +++ b/egs/tidigits/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. - -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -#export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. 
Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/timit/s5/cmd.sh b/egs/timit/s5/cmd.sh index 0150f486298..5abbfd4495a 100644 --- a/egs/timit/s5/cmd.sh +++ b/egs/timit/s5/cmd.sh @@ -1,36 +1,31 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -#export train_cmd="queue.pl -l arch=*64" -#export decode_cmd="queue.pl -l arch=*64,mem_free=2G,ram_free=2G" -#export mkgraph_cmd="queue.pl -l arch=*64,ram_free=4G,mem_free=4G" -#export cuda_cmd=run.pl +export train_cmd="queue.pl --mem 4G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" +# the use of cuda_cmd is deprecated but it's still sometimes used in nnet1 +# example scripts. +export cuda_cmd="queue.pl --gpu 1" - -if [ "$(hostname -d)" == "clsp.jhu.edu" ]; then - export train_cmd="queue.pl -l arch=*64*" - export decode_cmd="queue.pl -l arch=*64* --mem 3G" - export cuda_cmd="queue.pl -l gpu=1" -elif [ "$(hostname -d)" == "fit.vutbr.cz" ]; then +# the rest of this file is present for historical reasons. +# for cluster-specific configuration it's better to rely on conf/queue.conf. +if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then #b) BUT cluster options queue="all.q@@blade,all.q@@speech" gpu_queue="long.q@@gpu" storage="matylda5" export train_cmd="queue.pl -q $queue -l ram_free=1.5G,mem_free=1.5G,${storage}=0.5" export decode_cmd="queue.pl -q $queue -l ram_free=2.5G,mem_free=2.5G,${storage}=0.1" - export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" -else - echo "$0: you need to define options for your cluster." - exit 1; + export cuda_cmd="queue.pl -q $gpu_queue -l gpu=1" fi -#c) run locally... -#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl diff --git a/egs/voxforge/s5/cmd.sh b/egs/voxforge/s5/cmd.sh index 2d454050669..71dd849a93b 100644 --- a/egs/voxforge/s5/cmd.sh +++ b/egs/voxforge/s5/cmd.sh @@ -1,14 +1,15 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. 
- -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -export train_cmd=run.pl -export decode_cmd=run.pl - - - +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" diff --git a/egs/vystadial_cz/s5/cmd.sh b/egs/vystadial_cz/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_cz/s5/cmd.sh +++ b/egs/vystadial_cz/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/vystadial_en/s5/cmd.sh b/egs/vystadial_en/s5/cmd.sh index 0900744b5ae..bb0b5337cdb 100644 --- a/egs/vystadial_en/s5/cmd.sh +++ b/egs/vystadial_en/s5/cmd.sh @@ -1,22 +1,20 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. 
If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#export train_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -#export decode_cmd="queue.pl -q all.q@a*.clsp.jhu.edu" -# export train_cmd="queue.pl -l mf=5g" -# export decode_cmd="queue.pl -l mf=5g" -export train_cmd="queue.pl -l arch=*64*" -export decode_cmd="queue.pl -l arch=*64*" +export train_cmd="queue.pl --mem 2G" +export decode_cmd="queue.pl --mem 4G" +export mkgraph_cmd="queue.pl --mem 8G" -# The number of parallel jobs to be started for some parts of the recipe -# Make sure you have enough resources(CPUs and RAM) to accomodate this number of jobs -njobs=20 - -# If you have no GridEngine you can do: -#export train_cmd=run.pl -#export decode_cmd=run.pl -#njobs=2 +# this controls the number of parallel decoding jobs launched in run.sh if you +# are running locally (e.g. with run.pl) you can reduce it to control memory +# usage. +export njobs=20 diff --git a/egs/wsj/s5/cmd.sh b/egs/wsj/s5/cmd.sh index 96c48af42c1..537c46ba4f2 100644 --- a/egs/wsj/s5/cmd.sh +++ b/egs/wsj/s5/cmd.sh @@ -1,23 +1,23 @@ -# "queue.pl" uses qsub. The options to it are -# options to qsub. If you have GridEngine installed, -# change this to a queue you have access to. -# Otherwise, use "run.pl", which will run jobs locally -# (make sure your --num-jobs options are no more than -# the number of cpus on your machine. +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. -#a) JHU cluster options -export train_cmd="queue.pl -l arch=*64" -export decode_cmd="queue.pl -l arch=*64 --mem 2G" -export mkgraph_cmd="queue.pl -l arch=*64 --mem 4G" -export big_memory_cmd="queue.pl -l arch=*64 --mem 8G" -export cuda_cmd="queue.pl -l gpu=1" - -#b) run it locally... 
-#export train_cmd=run.pl -#export decode_cmd=run.pl -#export cuda_cmd=run.pl -#export mkgraph_cmd=run.pl +export train_cmd=queue.pl +export decode_cmd="queue.pl --mem 2G" +export mkgraph_cmd="queue.pl --mem 4G" +# the use of cuda_cmd is deprecated. +export cuda_cmd="queue.pl --gpu 1" +# the rest of this file is present for historical reasons. +# It's better to use conf/queue.conf for cluster-specific configuration. #c) BUT cluster: if [ "$(hostname -d)" == "fit.vutbr.cz" ]; then queue="all.q@@blade,all.q@@speech" diff --git a/egs/wsj/s5/local/run_kl_hmm.sh b/egs/wsj/s5/local/run_kl_hmm.sh index 9e7679a7675..efe95052c1d 100644 --- a/egs/wsj/s5/local/run_kl_hmm.sh +++ b/egs/wsj/s5/local/run_kl_hmm.sh @@ -5,6 +5,8 @@ . cmd.sh +big_memory_cmd="$decode_cmd --mem 8G" + states=20000 dir=exp/tri4b_pretrain-dbn_dnn/ From dbb028fb6188dcded7118b2650a8281d6a6fc4fe Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 19 Feb 2016 00:51:26 -0500 Subject: [PATCH 21/32] clarifying configuration process for CUDA, and give prototype Makefiles more meaningful names --- src/configure | 38 ++++++++++--------- .../{linux_cuda.mk => cuda_32bit.mk} | 4 +- .../{linux_x86_64_cuda.mk => cuda_64bit.mk} | 9 +---- 3 files changed, 23 insertions(+), 28 deletions(-) rename src/makefiles/{linux_cuda.mk => cuda_32bit.mk} (83%) rename src/makefiles/{linux_x86_64_cuda.mk => cuda_64bit.mk} (55%) diff --git a/src/configure b/src/configure index 2695859de84..0f6577dde17 100755 --- a/src/configure +++ b/src/configure @@ -403,11 +403,11 @@ function linux_configure_mkl_threading { } ## -##CUDA is used in src/cudamatrix and src/nnet{,bin} only. -##It is used to accelerate the neural network training, -##the rest of kaldi is running on CPUs. +## CUDA is used only in selected directories including src/cudamatrix, src/nnet* +## and src/chain*. It is used to accelerate the neural network training, the +## rest of kaldi runs on CPUs. ## -function linux_configure_cuda { +function configure_cuda { #check for CUDA toolkit in the system if [ ! $CUDATKDIR ]; then for base in /Developer/NVIDIA/CUDA-6.0 /usr/local/share/cuda /usr/local/cuda /pkgs_local/cuda-3.2/ /opt/nvidia_cuda/cuda-6.0/ /usr/; do @@ -428,9 +428,13 @@ function linux_configure_cuda { echo CUDATKDIR = $CUDATKDIR >> kaldi.mk if [ "`uname -m`" == "x86_64" ]; then - cat makefiles/linux_x86_64_cuda.mk >> kaldi.mk + if [ "`uname`" == "Darwin" ]; then + sed 's/lib64/lib/g' < makefiles/cuda_64bit.mk >> kaldi.mk + else + cat makefiles/cuda_64bit.mk >> kaldi.mk + fi else - cat makefiles/linux_cuda.mk >> kaldi.mk + cat makefiles/cuda_32bit.mk >> kaldi.mk fi else echo "CUDA will not be used! 
If you have already installed cuda drivers " @@ -541,7 +545,7 @@ function linux_configure_debian_ubuntu { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -560,7 +564,7 @@ function linux_configure_debian_ubuntu3 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian/Ubuntu Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -582,7 +586,7 @@ function linux_configure_debian7 { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for Debian 7 [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex exit_success; } @@ -601,7 +605,7 @@ function linux_configure_redhat { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -622,7 +626,7 @@ function linux_configure_redhat_fat { cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag echo "Successfully configured for red hat [dynamic libraries, fat] with ATLASLIBS =$ATLASLIBS" - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda exit_success; } @@ -674,7 +678,7 @@ function linux_configure_static { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [static libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -753,7 +757,7 @@ function linux_configure_dynamic { echo ATLASLIBS = $ATLASLIBS >> kaldi.mk cat makefiles/linux_atlas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux [dynamic libraries] with ATLASLIBS =$ATLASLIBS" exit_success; @@ -813,7 +817,7 @@ echo "Doing OS specific configurations ..." # which crashes on Darwin. Also the linear algebra libraries on Macs are # used differently (through the Accelerate framework) than on Linux. if [ "`uname`" == "Darwin" ]; then - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda echo "On Darwin: checking for Accelerate framework ..." if [ ! -e /System/Library/Frameworks/Accelerate.framework ]; then failure "Need the Accelerate.framework to compile on Darwin." @@ -973,7 +977,7 @@ if [ "`uname`" == "Linux" ]; then fix_cxx_flag echo "MKLFLAGS = ${MKL_LINK_LINE} ${THREADING_LINE} $EXTRA_LIBS " >> kaldi.mk - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with MKL libs from $MKLROOT" exit_success; @@ -996,7 +1000,7 @@ if [ "`uname`" == "Linux" ]; then cat makefiles/linux_clapack.mk >> kaldi.mk fix_cxx_flag echo "Warning (CLAPACK): this part of the configure process is not properly tested and will not work." 
- $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured for Linux with CLAPACK libs from $CLAPACKROOT" exit_success; @@ -1020,7 +1024,7 @@ if [ "`uname`" == "Linux" ]; then echo "OPENBLASROOT = $OPENBLASROOT" >> kaldi.mk cat makefiles/linux_openblas.mk >> kaldi.mk fix_cxx_flag - $use_cuda && linux_configure_cuda + $use_cuda && configure_cuda linux_configure_speex echo "Successfully configured OpenBLAS from $OPENBLASROOT." exit_success; diff --git a/src/makefiles/linux_cuda.mk b/src/makefiles/cuda_32bit.mk similarity index 83% rename from src/makefiles/linux_cuda.mk rename to src/makefiles/cuda_32bit.mk index 502bf0ffc03..c89bf2e409d 100644 --- a/src/makefiles/linux_cuda.mk +++ b/src/makefiles/cuda_32bit.mk @@ -1,8 +1,6 @@ CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 32 -DHAVE_CUDA - -CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include +CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include LDFLAGS += -L$(CUDATKDIR)/lib -Wl,-rpath=$(CUDATKDIR)/lib LDLIBS += -lcublas -lcudart #LDLIBS : The libs are loaded later than static libs in implicit rule - diff --git a/src/makefiles/linux_x86_64_cuda.mk b/src/makefiles/cuda_64bit.mk similarity index 55% rename from src/makefiles/linux_x86_64_cuda.mk rename to src/makefiles/cuda_64bit.mk index 46613083188..25400f452f8 100644 --- a/src/makefiles/linux_x86_64_cuda.mk +++ b/src/makefiles/cuda_64bit.mk @@ -1,14 +1,7 @@ CUDA_INCLUDE= -I$(CUDATKDIR)/include CUDA_FLAGS = -g -Xcompiler -fPIC --verbose --machine 64 -DHAVE_CUDA - -CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include -UNAME := $(shell uname) -#aware of fact in cuda60 there is no lib64, just lib. -ifeq ($(UNAME), Darwin) -CUDA_LDFLAGS += -L$(CUDATKDIR)/lib -Wl,-rpath,$(CUDATKDIR)/lib -else +CXXFLAGS += -DHAVE_CUDA -I$(CUDATKDIR)/include CUDA_LDFLAGS += -L$(CUDATKDIR)/lib64 -Wl,-rpath,$(CUDATKDIR)/lib64 -endif CUDA_LDLIBS += -lcublas -lcudart #LDLIBS : The libs are loaded later than static libs in implicit rule From 6b982f6fc1c2b87ba4fc608886a6b981a618b6d2 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Fri, 19 Feb 2016 16:40:26 -0500 Subject: [PATCH 22/32] xvector: extending get_egs and related scripts for xvector training --- .../local/xvector/prepare_perturbed_data.sh | 6 +- .../steps/nnet3/xvector/allocate_examples.py | 64 ++++-- egs/wsj/s5/steps/nnet3/xvector/get_egs.sh | 217 +++++++----------- src/nnet3bin/nnet3-xvector-get-egs.cc | 21 +- 4 files changed, 137 insertions(+), 171 deletions(-) diff --git a/egs/swbd/s5c/local/xvector/prepare_perturbed_data.sh b/egs/swbd/s5c/local/xvector/prepare_perturbed_data.sh index ea863cb672b..7ce4d553733 100755 --- a/egs/swbd/s5c/local/xvector/prepare_perturbed_data.sh +++ b/egs/swbd/s5c/local/xvector/prepare_perturbed_data.sh @@ -21,11 +21,11 @@ if [ $stage -le 1 ]; then if [ -d data/${datadir}_sp ]; then echo "$0: directory ${datadir}_sp already exists, skipping creating it." else - utils/data/perturb_data_dir_speed_3way.sh ${datadir} ${datadir}_sp - utils/data/perturb_data_dir_volume.sh ${datadir}_sp + utils/data/perturb_data_dir_speed_3way.sh data/${datadir} data/${datadir}_sp + utils/data/perturb_data_dir_volume.sh data/${datadir}_sp fi if [ -f data/${datadir}_sp_hires/feats.scp ]; then - echo "$0: directory ${datadir}_sp_hires/feats.scp already exists, skipping creating it." + echo "$0: directory data/${datadir}_sp_hires/feats.scp already exists, skipping creating it." 
else mfccdir=mfcc utils/copy_data_dir.sh data/${datadir}_sp data/${datadir}_sp_hires diff --git a/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py b/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py index 24d6bdf217a..39e11f23b85 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py +++ b/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py @@ -9,21 +9,21 @@ # --num-archives=169 --num-jobs=24 exp/xvector_a/egs/temp/utt2len.train exp/xvector_a/egs # # and this program outputs certain things to the temp directory (exp/xvector_a/egs/temp in this case) -# that will enable you to dump the xvectors. What we'll eventually be doing is invoking the following -# program with something like the following args: +# that will enable you to dump the chunks for xvector trainign. What we'll eventually be doing is invoking +# the following program with something like the following args: # -# nnet3-xvector-get-egs1 [options] exp/xvector_a/temp/ranges.1 scp:data/train/feats.scp \ +# nnet3-xvector-get-egs [options] exp/xvector_a/temp/ranges.1 scp:data/train/feats.scp \ # ark:exp/xvector_a/egs/egs_temp.1.ark ark:exp/xvector_a/egs/egs_temp.2.ark \ # ark:exp/xvector_a/egs/egs_temp.3.ark # # where exp/xvector_a/temp/ranges.1 contains something like the following: # -# utt1 3 0 65 112 110 -# utt1 0 160 50 214 180 +# utt1 3 0 0 65 112 110 +# utt1 0 2 160 50 214 180 # utt2 ... # # where each line is interpreted as follows: -# +# # and for each line we create an eg (containing two possibly-different-length chunks of data from the # same utterance), to one of the output archives. The list of archives corresponding to # ranges.n will be written to output.n, so in exp/xvector_a/temp/outputs.1 we'd have: @@ -52,10 +52,18 @@ parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and archive_chunk_lengths files " "in preparation for dumping egs for xvector training.", epilog="Called by steps/nnet3/xvector/get_egs.sh") +parser.add_argument("--prefix", type=str, default="", + help="Adds a prefix to the output files. This is used to distinguish between the train " + "and diagnostic files.") parser.add_argument("--min-frames-per-chunk", type=int, default=50, help="Minimum number of frames-per-chunk used for any archive") parser.add_argument("--max-frames-per-chunk", type=int, default=300, help="Maximum number of frames-per-chunk used for any archive") +parser.add_argument("--randomize-chunk-length", type=str, + help="If true, randomly pick a chunk length in [min-frames-per-chunk, max-frames-per-chunk]." + "If false, the chunk length varies from min-frames-per-chunk to max-frames-per-chunk" + "according to a geometric sequence.", + default="true", choices = ["false", "true"]) parser.add_argument("--frames-per-iter", type=int, default=1000000, help="Target number of frames for each archive") parser.add_argument("--num-archives", type=int, default=-1, @@ -137,6 +145,18 @@ def RandomChunkLength(): ans = int(math.exp(log_value) + 0.45) return ans +# This function returns an integer in the range +# [min-frames-per-chunk, max-frames-per-chunk] according to a geometric +# sequence. For example, suppose min-frames-per-chunk is 50, +# max-frames-per-chunk is 200, and args.num_archives is 3. Then the +# lengths for archives 0, 1, and 2 will be 50, 100, and 200. 
+def DeterministicChunkLength(archive_id): + ans = int(math.pow(float(args.max_frames_per_chunk) / + args.min_frames_per_chunk, float(archive_id) / + (args.num_archives-1)) * args.min_frames_per_chunk + 0.5) + return ans + + # given an utterance length utt_length (in frames) and two desired chunk lengths # (length1 and length2) whose sum is <= utt_length, @@ -180,14 +200,21 @@ def GetRandomOffsets(utt_length, length1, length2): # an array of 3-tuples (utterance-index, offset1, offset2) all_egs= [] -info_f = open(args.egs_dir + "/temp/archive_chunk_lengths", "w") -if info_f is None: - sys.exit("Error opening file {0}/temp/archive_chunk_lengths".format(args.egs_dir)); +prefix = "" +if args.prefix != "": + prefix = args.prefix + "_" +info_f = open(args.egs_dir + "/temp/" + prefix + "archive_chunk_lengths", "w") +if info_f is None: + sys.exit(str("Error opening file {0}/temp/" + prefix + "archive_chunk_lengths").format(args.egs_dir)); for archive_index in range(args.num_archives): print("Processing archive {0}".format(archive_index + 1)) - length1 = RandomChunkLength(); - length2 = RandomChunkLength(); + if args.randomize_chunk_length == "true": + length1 = RandomChunkLength(); + length2 = length1 + else: + length1 = DeterministicChunkLength(archive_index); + length2 = length1 print("{0} {1} {2}".format(archive_index + 1, length1, length2), file=info_f) archive_chunk_lengths.append( (length1, length2) ) tot_length = length1 + length2 @@ -218,12 +245,13 @@ def GetRandomOffsets(utt_length, length1, length2): for (utterance_index, offset1, offset2) in all_egs[cur_archive]: this_ranges.append( (utterance_index, i, offset1, offset2) ) cur_archive = cur_archive + 1 - f = open(args.egs_dir + "/temp/ranges." + str(job + 1), "w") + f = open(args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1), "w") if f is None: - sys.exit("Error opening file " + args.egs_dir + "/temp/ranges." + str(job + 1)) + sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1)) for (utterance_index, i, offset1, offset2) in sorted(this_ranges): archive_index = this_archives_for_job[i] - print("{0} {1} {2} {3} {4}".format(utt_ids[utterance_index], + print("{0} {1} {2} {3} {4} {5} {6}".format(utt_ids[utterance_index], + i, archive_index + 1, offset1, archive_chunk_lengths[archive_index][0], @@ -232,13 +260,13 @@ def GetRandomOffsets(utt_length, length1, length2): file=f) f.close() - f = open(args.egs_dir + "/temp/outputs." + str(job + 1), "w") + f = open(args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1), "w") if f is None: - sys.exit("Error opening file " + args.egs_dir + "/temp/outputs." + str(job + 1)) - print( " ".join([ "{0}/egs_temp.{1}.ark".format(args.egs_dir, n + 1) for n in this_archives_for_job ]), + sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1)) + print( " ".join([ str("{0}/" + prefix + "egs_temp.{1}.ark").format(args.egs_dir, n + 1) for n in this_archives_for_job ]), file=f) f.close() -print("allocate_examples.py: finished generating ranges.* and outputs.* files") +print("allocate_examples.py: finished generating " + prefix + "ranges.* and " + prefix + "outputs.* files") diff --git a/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh b/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh index 4b0d558bc09..2ab81395d47 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh +++ b/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh @@ -1,6 +1,8 @@ #!/bin/bash -# Copyright 2012-2015 Johns Hopkins University (Author: Daniel Povey). Apache 2.0. 
+# Copyright 2012-2016 Johns Hopkins University (Author: Daniel Povey) +# 2016 David Snyder +# Apache 2.0 # # This script dumps training examples (egs) for xvector training. These egs # have only an input and no outputs (the inputs are typically MFCCs). The egs @@ -15,12 +17,6 @@ # This script, which will generally be called from other neural-net training # scripts, extracts the training examples used to train the neural net (and also # the validation examples used for diagnostics), and puts them in separate archives. -# -# This script dumps egs with several frames of labels, controlled by the -# frames_per_eg config variable (default: 8). This takes many times less disk -# space because typically we have 4 to 7 frames of context on the left and -# right, and this ends up getting shared. This is at the expense of slightly -# higher disk I/O while training. # Begin configuration section. @@ -94,6 +90,9 @@ if [ ! -f $data/feats.scp ]; then exit 1 fi +sdata=$data/split$nj +utils/split_data.sh $data $nj + if [ ! -f $data/utt2dur ]; then # getting this utt2dur will normally be more lightweight than # getting the exact utterance-to-length map. @@ -120,28 +119,42 @@ if [ $stage -le 1 ]; then echo "$0: getting list of validation utterances" # Get list of validation utterances. - awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_utts_subset \ + awk '{print $1}' $data/utt2spk | utils/shuffle_list.pl | head -$num_heldout_utts \ > $temp/valid_uttlist || exit 1; + awk '{print $1}' $data/utt2spk | utils/filter_scp.pl --exclude $temp/valid_uttlist \ + | utils/shuffle_list.pl | head -$num_heldout_utts > $temp/train_subset_uttlist || exit 1; + if [ -f $data/utt2uniq ]; then # this matters if you use data augmentation. - echo "File $data/utt2uniq exists, so augmenting valid_uttlist to" - echo "include all perturbed versions of the same 'real' utterances." - mv $temp/valid_uttlist $temp/valid_uttlist.tmp utils/utt2spk_to_spk2utt.pl $data/utt2uniq > $temp/uniq2utt - cat $temp/valid_uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ - sort | uniq | utils/apply_map.pl $temp/uniq2utt | \ - awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $temp/valid_uttlist - rm $temp/uniq2utt $temp/valid_uttlist.tmp + for uttlist in valid_uttlist train_subset_uttlist; do + echo "File $data/utt2uniq exists, so augmenting $uttlist to" + echo "include all perturbed versions of the same 'real' utterances." + mv $temp/$uttlist $temp/${uttlist}.tmp + cat $temp/$uttlist.tmp | utils/apply_map.pl $data/utt2uniq | \ + sort | uniq | utils/apply_map.pl $temp/uniq2utt | \ + awk '{for(n=1;n<=NF;n++) print $n;}' | sort > $temp/$uttlist + done + rm $temp/uniq2utt $temp/$uttlist.tmp fi + + awk '{print $1}' $temp/utt2len | utils/filter_scp.pl --exclude $temp/valid_uttlist <$temp/utt2len > $temp/utt2len.train utils/filter_scp.pl $temp/valid_uttlist <$temp/utt2len > $temp/utt2len.valid + utils/filter_scp.pl $temp/train_subset_uttlist <$temp/utt2len > $temp/utt2len.train_subset fi +# TODO: Currently just supporting raw features +feats="scp,s,cs:utils/filter_scp.pl $temp/ranges.JOB $data/feats.scp |" +valid_feats="scp,s,cs:utils/filter_scp.pl $temp/valid_uttlist $data/feats.scp |" +train_subset_feats="scp,s,cs:utils/filter_scp.pl $temp/train_subset_uttlist $data/feats.scp |" + # first for the training data... work out how many archives. 
num_train_frames=$(awk '{n += $2} END{print n}' <$temp/utt2len.train) num_valid_frames=$(awk '{n += $2} END{print n}' <$temp/utt2len.valid) +num_train_subset_frames=$(awk '{n += $2} END{print n}' <$temp/utt2len.train_subset) echo $num_train_frames >$dir/info/num_frames @@ -166,147 +179,71 @@ if [ $stage -le 2 ]; then fi if [ $stage -le 3 ]; then - echo "$0: allocating examples" - $cmd $dir/log/allocate_examples.log \ + echo "$0: allocating training examples" + $cmd $dir/log/allocate_examples_train.log \ steps/nnet3/xvector/allocate_examples.py \ --min-frames-per-chunk=$min_frames_per_chunk \ --max-frames-per-chunk=$max_frames_per_chunk \ --frames-per-iter=$frames_per_iter \ --num-archives=$num_train_archives --num-jobs=$nj \ $dir/temp/utt2len.train $dir || exit 1 -fi - -# HERE - todo. - -exit 0 - + echo "$0: allocating training subset examples" + $cmd $dir/log/allocate_examples_train_subset.log \ + steps/nnet3/xvector/allocate_examples.py \ + --prefix train_subset \ + --min-frames-per-chunk=$min_frames_per_chunk \ + --max-frames-per-chunk=$max_frames_per_chunk \ + --randomize-chunk-length false \ + --frames-per-iter=$frames_per_iter_diagnostic \ + --num-archives=$num_diagnostic_archives --num-jobs=1 \ + $dir/temp/utt2len.train_subset $dir || exit 1 - -if [ $stage -le 2 ]; then - echo "$0: copying data alignments" - for id in $(seq $num_ali_jobs); do gunzip -c $alidir/ali.$id.gz; done | \ - copy-int-vector ark:- ark,scp:$dir/ali.ark,$dir/ali.scp || exit 1; -fi - -egs_opts="--left-context=$left_context --right-context=$right_context --compress=$compress" - -[ -z $valid_left_context ] && valid_left_context=$left_context; -[ -z $valid_right_context ] && valid_right_context=$right_context; -valid_egs_opts="--left-context=$valid_left_context --right-context=$valid_right_context --compress=$compress" - -echo $left_context > $dir/info/left_context -echo $right_context > $dir/info/right_context -num_pdfs=$(tree-info --print-args=false $alidir/tree | grep num-pdfs | awk '{print $2}') -if [ $stage -le 3 ]; then - echo "$0: Getting validation and training subset examples." - rm $dir/.error 2>/dev/null - echo "$0: ... extracting validation and training-subset alignments." - - utils/filter_scp.pl <(cat $dir/valid_uttlist $dir/train_subset_uttlist) \ - <$dir/ali.scp >$dir/ali_special.scp - - $cmd $dir/log/create_valid_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $valid_ivector_opt $valid_egs_opts "$valid_feats" \ - "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/valid_all.egs" || touch $dir/.error & - $cmd $dir/log/create_train_subset.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $train_subset_ivector_opt $valid_egs_opts "$train_subset_feats" \ - "ark,s,cs:ali-to-pdf $alidir/final.mdl scp:$dir/ali_special.scp ark:- | ali-to-post ark:- ark:- |" \ - "ark:$dir/train_subset_all.egs" || touch $dir/.error & - wait; - [ -f $dir/.error ] && echo "Error detected while creating train/valid egs" && exit 1 - echo "... Getting subsets of validation examples for diagnostics and combination." 
- $cmd $dir/log/create_valid_subset_combine.log \ - nnet3-subset-egs --n=$num_valid_frames_combine ark:$dir/valid_all.egs \ - ark:$dir/valid_combine.egs || touch $dir/.error & - $cmd $dir/log/create_valid_subset_diagnostic.log \ - nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/valid_all.egs \ - ark:$dir/valid_diagnostic.egs || touch $dir/.error & - - $cmd $dir/log/create_train_subset_combine.log \ - nnet3-subset-egs --n=$num_train_frames_combine ark:$dir/train_subset_all.egs \ - ark:$dir/train_combine.egs || touch $dir/.error & - $cmd $dir/log/create_train_subset_diagnostic.log \ - nnet3-subset-egs --n=$num_frames_diagnostic ark:$dir/train_subset_all.egs \ - ark:$dir/train_diagnostic.egs || touch $dir/.error & - wait - sleep 5 # wait for file system to sync. - cat $dir/valid_combine.egs $dir/train_combine.egs > $dir/combine.egs - - for f in $dir/{combine,train_diagnostic,valid_diagnostic}.egs; do - [ ! -s $f ] && echo "No examples in file $f" && exit 1; - done - rm $dir/valid_all.egs $dir/train_subset_all.egs $dir/{train,valid}_combine.egs + echo "$0: allocating validation examples" + $cmd $dir/log/allocate_examples_valid.log \ + steps/nnet3/xvector/allocate_examples.py \ + --prefix valid \ + --min-frames-per-chunk=$min_frames_per_chunk \ + --max-frames-per-chunk=$max_frames_per_chunk \ + --randomize-chunk-length false \ + --frames-per-iter=$frames_per_iter_diagnostic \ + --frames-per-iter=$frames_per_iter_diagnostic \ + --num-archives=$num_diagnostic_archives --num-jobs=1 \ + $dir/temp/utt2len.valid $dir || exit 1 fi if [ $stage -le 4 ]; then - # create egs_orig.*.*.ark; the first index goes to $nj, - # the second to $num_archives_intermediate. - - egs_list= - for n in $(seq $num_archives_intermediate); do - egs_list="$egs_list ark:$dir/egs_orig.JOB.$n.ark" - done echo "$0: Generating training examples on disk" - # The examples will go round-robin to egs_list. - $cmd JOB=1:$nj $dir/log/get_egs.JOB.log \ - nnet3-get-egs --num-pdfs=$num_pdfs $ivector_opt $egs_opts --num-frames=$frames_per_eg "$feats" \ - "ark,s,cs:filter_scp.pl $sdata/JOB/utt2spk $dir/ali.scp | ali-to-pdf $alidir/final.mdl scp:- ark:- | ali-to-post ark:- ark:- |" ark:- \| \ - nnet3-copy-egs --random=true --srand=JOB ark:- $egs_list || exit 1; + for g in $(seq $nj); do + outputs=`awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/outputs.$g` + $cmd $dir/log/train_create_examples.$g.log \ + nnet3-xvector-get-egs $temp/ranges.$g \ + "`echo $feats | sed s/JOB/$g/g`" $outputs || exit 1 & + done + wait + train_subset_outputs=`awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/train_subset_outputs.1` + echo "$0: Generating training subset examples on disk" + $cmd $dir/log/train_subset_create_examples.1.log \ + nnet3-xvector-get-egs $temp/train_subset_ranges.1 \ + "$train_subset_feats" $train_subset_outputs || exit 1 + valid_outputs=`awk '{for(i=1;i<=NF;i++)printf("ark:%s ",$i);}' $temp/valid_outputs.1` + echo "$0: Generating validation examples on disk" + $cmd $dir/log/valid_create_examples.1.log \ + nnet3-xvector-get-egs $temp/valid_ranges.1 \ + "$valid_feats" $valid_outputs || exit 1 fi if [ $stage -le 5 ]; then - echo "$0: recombining and shuffling order of archives on disk" - # combine all the "egs_orig.*.JOB.scp" (over the $nj splits of the data) and - # shuffle the order, writing to the egs.JOB.ark - - # the input is a concatenation over the input jobs. - egs_list= - for n in $(seq $nj); do - egs_list="$egs_list $dir/egs_orig.$n.JOB.ark" - done - - if [ $archives_multiple == 1 ]; then # normal case. 
- $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:$dir/egs.JOB.ark || exit 1; - else - # we need to shuffle the 'intermediate archives' and then split into the - # final archives. we create soft links to manage this splitting, because - # otherwise managing the output names is quite difficult (and we don't want - # to submit separate queue jobs for each intermediate archive, because then - # the --max-jobs-run option is hard to enforce). - output_archives="$(for y in $(seq $archives_multiple); do echo ark:$dir/egs.JOB.$y.ark; done)" - for x in $(seq $num_archives_intermediate); do - for y in $(seq $archives_multiple); do - archive_index=$[($x-1)*$archives_multiple+$y] - # egs.intermediate_archive.{1,2,...}.ark will point to egs.archive.ark - ln -sf egs.$archive_index.ark $dir/egs.$x.$y.ark || exit 1 - done - done - $cmd --max-jobs-run $nj JOB=1:$num_archives_intermediate $dir/log/shuffle.JOB.log \ - nnet3-shuffle-egs --srand=JOB "ark:cat $egs_list|" ark:- \| \ - nnet3-copy-egs ark:- $output_archives || exit 1; - fi - + echo "$0: Shuffling order of archives on disk" + $cmd --max-jobs-run $nj JOB=1:$num_train_archives $dir/log/shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB ark:$dir/egs_temp.JOB.ark ark:$dir/egs.JOB.ark || exit 1; + + $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/train_subset_shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB ark:$dir/train_subset_egs_temp.JOB.ark ark:$dir/train_diagnostic_egs.JOB.ark || exit 1; + $cmd --max-jobs-run $nj JOB=1:$num_diagnostic_archives $dir/log/valid_shuffle.JOB.log \ + nnet3-shuffle-egs --srand=JOB ark:$dir/valid_egs_temp.JOB.ark ark:$dir/valid_diagnostic_egs.JOB.ark || exit 1; fi -if [ $stage -le 6 ]; then - echo "$0: removing temporary archives" - for x in $(seq $nj); do - for y in $(seq $num_archives_intermediate); do - file=$dir/egs_orig.$x.$y.ark - [ -L $file ] && rm $(readlink -f $file) - rm $file - done - done - if [ $archives_multiple -gt 1 ]; then - # there are some extra soft links that we should delete. - for f in $dir/egs.*.*.ark; do rm $f; done - fi - echo "$0: removing temporary alignments and transforms" - # Ignore errors below because trans.* might not exist. - rm $dir/{ali,trans}.{ark,scp} 2>/dev/null -fi +#TODO: Probably need to cleanup the temp egs. echo "$0: Finished preparing training examples" diff --git a/src/nnet3bin/nnet3-xvector-get-egs.cc b/src/nnet3bin/nnet3-xvector-get-egs.cc index 24e50560b54..55ba475c0fe 100644 --- a/src/nnet3bin/nnet3-xvector-get-egs.cc +++ b/src/nnet3bin/nnet3-xvector-get-egs.cc @@ -49,15 +49,15 @@ static void ProcessRangeFile(const std::string &range_rxfilename, ChunkPairInfo *pair = new ChunkPairInfo(); std::vector fields; SplitStringToVector(line, " \t\n\r", true, &fields); - if (fields.size() != 6) - KALDI_ERR << "Expected 6 fields in line of range file, got " + if (fields.size() != 7) + KALDI_ERR << "Expected 7 fields in line of range file, got " << fields.size() << " instead."; std::string utt = fields[0], - start_frame1_str = fields[2], - num_frames1_str = fields[3], - start_frame2_str = fields[4], - num_frames2_str = fields[5]; + start_frame1_str = fields[3], + num_frames1_str = fields[4], + start_frame2_str = fields[5], + num_frames2_str = fields[6]; if (!ConvertStringToInteger(fields[1], &(pair->output_archive_id)) || !ConvertStringToInteger(start_frame1_str, &(pair->start_frame1)) @@ -166,11 +166,12 @@ int main(int argc, char *argv[]) { "the same utterance. 
The location and length of the feature chunks\n" "are specified in the 'ranges' file. Each line is interpreted as\n" "follows:\n" - " " - " \n" + " " + " " + " \n" "For example:\n" - " utt1 3 0 65 112 110\n" - " utt1 0 160 50 214 180\n" + " utt1 3 13 0 65 112 110\n" + " utt1 0 10 160 50 214 180\n" " utt2 ...\n" "\n" "Usage: nnet3-xvector-get-egs [options] " From 96413f5c027cff4cfc2912e41e7935940f4a497d Mon Sep 17 00:00:00 2001 From: David Snyder Date: Fri, 19 Feb 2016 16:47:58 -0500 Subject: [PATCH 23/32] xvector: fixing typo --- egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py b/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py index 39e11f23b85..8ef0ded7c15 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py +++ b/egs/wsj/s5/steps/nnet3/xvector/allocate_examples.py @@ -9,7 +9,7 @@ # --num-archives=169 --num-jobs=24 exp/xvector_a/egs/temp/utt2len.train exp/xvector_a/egs # # and this program outputs certain things to the temp directory (exp/xvector_a/egs/temp in this case) -# that will enable you to dump the chunks for xvector trainign. What we'll eventually be doing is invoking +# that will enable you to dump the chunks for xvector training. What we'll eventually be doing is invoking # the following program with something like the following args: # # nnet3-xvector-get-egs [options] exp/xvector_a/temp/ranges.1 scp:data/train/feats.scp \ From 90af624290c987ddbd89dee48c0462093c62cafd Mon Sep 17 00:00:00 2001 From: Dan Povey Date: Fri, 19 Feb 2016 17:24:38 -0500 Subject: [PATCH 24/32] chain branch: bug-fix in self-repair code for sigmoid units --- src/nnet3/nnet-simple-component.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nnet3/nnet-simple-component.cc b/src/nnet3/nnet-simple-component.cc index aadd0c05a1d..28fa24b4fae 100644 --- a/src/nnet3/nnet-simple-component.cc +++ b/src/nnet3/nnet-simple-component.cc @@ -510,7 +510,7 @@ void SigmoidComponent::RepairGradients( in_deriv->AddMatDiagVec(-2.0 * self_repair_scale_ / repair_probability, out_value, kNoTrans, thresholds_vec); - in_deriv->AddVecToCols(self_repair_scale_ / repair_probability, + in_deriv->AddVecToRows(self_repair_scale_ / repair_probability, thresholds_vec); } From e980e75778eea797ed96e3583d3ac11c713d2175 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Fri, 19 Feb 2016 19:25:19 -0500 Subject: [PATCH 25/32] small cosmetic change to RM example script for chain models --- egs/rm/s5/local/chain/run_tdnn_5f.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/egs/rm/s5/local/chain/run_tdnn_5f.sh b/egs/rm/s5/local/chain/run_tdnn_5f.sh index 3cd46707ef3..0379d16fe13 100644 --- a/egs/rm/s5/local/chain/run_tdnn_5f.sh +++ b/egs/rm/s5/local/chain/run_tdnn_5f.sh @@ -52,7 +52,7 @@ if [ $stage -le 4 ]; then # Get the alignments as lattices (gives the CTC training more freedom). # use the same num-jobs as the alignments nj=$(cat exp/tri3b_ali/num_jobs) || exit 1; - steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd -l q=all.q" data/train \ + steps/align_fmllr_lats.sh --nj $nj --cmd "$train_cmd" data/train \ data/lang exp/tri3b exp/tri3b_lats rm exp/tri3b_lats/fsts.*.gz # save space fi @@ -74,7 +74,7 @@ if [ $stage -le 6 ]; then # Build a tree using our new topology. 
steps/nnet3/chain/build_tree.sh --frame-subsampling-factor 3 \ --leftmost-questions-truncate $leftmost_questions_truncate \ - --cmd "$train_cmd -l q=all.q" 1200 data/train $lang $ali_dir $treedir + --cmd "$train_cmd" 1200 data/train $lang $ali_dir $treedir fi if [ $stage -le 7 ]; then From 0ca297f1239c11a19728694d2586b574250cca90 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Sat, 20 Feb 2016 13:59:45 -0500 Subject: [PATCH 26/32] xvector: fixing makefiles and paths for xvector code --- egs/swbd/s5c/path.sh | 2 +- src/xvector/nnet-xvector-diagnostics.h | 6 +++--- src/xvector/nnet-xvector-training.h | 6 +++--- src/xvector/xvector.h | 4 ++-- src/xvectorbin/Makefile | 5 +++-- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/egs/swbd/s5c/path.sh b/egs/swbd/s5c/path.sh index c6b8450c86a..a07adf42589 100755 --- a/egs/swbd/s5c/path.sh +++ b/egs/swbd/s5c/path.sh @@ -1,4 +1,4 @@ export KALDI_ROOT=`pwd`/../../.. -export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/chainbin:$KALDI_ROOT/src/kwsbin:$PWD:$PATH +export PATH=$PWD/utils/:$KALDI_ROOT/src/bin:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/src/fstbin/:$KALDI_ROOT/src/gmmbin/:$KALDI_ROOT/src/featbin/:$KALDI_ROOT/src/lm/:$KALDI_ROOT/src/sgmmbin/:$KALDI_ROOT/src/sgmm2bin/:$KALDI_ROOT/src/fgmmbin/:$KALDI_ROOT/src/latbin/:$KALDI_ROOT/src/nnetbin:$KALDI_ROOT/src/nnet2bin:$KALDI_ROOT/src/nnet3bin:$KALDI_ROOT/src/online2bin/:$KALDI_ROOT/src/ivectorbin/:$KALDI_ROOT/src/xvectorbin/:$KALDI_ROOT/src/lmbin/:$KALDI_ROOT/src/chainbin:$KALDI_ROOT/src/kwsbin:$PWD:$PATH export LC_ALL=C diff --git a/src/xvector/nnet-xvector-diagnostics.h b/src/xvector/nnet-xvector-diagnostics.h index d43a38a3ed4..046088518b1 100644 --- a/src/xvector/nnet-xvector-diagnostics.h +++ b/src/xvector/nnet-xvector-diagnostics.h @@ -18,8 +18,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#ifndef KALDI_NNET3_NNET_XVECTOR_DIAGNOSTICS_H_ -#define KALDI_NNET3_NNET_XVECTOR_DIAGNOSTICS_H_ +#ifndef KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ #include "nnet3/nnet-example.h" #include "nnet3/nnet-computation.h" @@ -89,4 +89,4 @@ class NnetXvectorComputeProb { } // namespace nnet3 } // namespace kaldi -#endif // KALDI_NNET3_NNET_XVECTOR_DIAGNOSTICS_H_ +#endif // KALDI_XVECTOR_NNET_XVECTOR_DIAGNOSTICS_H_ diff --git a/src/xvector/nnet-xvector-training.h b/src/xvector/nnet-xvector-training.h index e8fb3d20e6a..58ff9211310 100644 --- a/src/xvector/nnet-xvector-training.h +++ b/src/xvector/nnet-xvector-training.h @@ -18,8 +18,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. 
-#ifndef KALDI_IVECTOR_NNET_XVECTOR_TRAINING_H_ -#define KALDI_IVECTOR_NNET_XVECTOR_TRAINING_H_ +#ifndef KALDI_XVECTOR_NNET_XVECTOR_TRAINING_H_ +#define KALDI_XVECTOR_NNET_XVECTOR_TRAINING_H_ #include "nnet3/nnet-example.h" #include "nnet3/nnet-computation.h" @@ -86,4 +86,4 @@ void GetComputationRequestXvector(const Nnet &nnet, } // namespace nnet3 } // namespace kaldi -#endif // +#endif // diff --git a/src/xvector/xvector.h b/src/xvector/xvector.h index 50d58ec7a93..75083533acd 100644 --- a/src/xvector/xvector.h +++ b/src/xvector/xvector.h @@ -18,8 +18,8 @@ // See the Apache 2 License for the specific language governing permissions and // limitations under the License. -#ifndef KALDI_IVECTOR_XVECTOR_H_ -#define KALDI_IVECTOR_XVECTOR_H_ +#ifndef KALDI_XVECTOR_XVECTOR_H_ +#define KALDI_XVECTOR_XVECTOR_H_ #include #include "base/kaldi-common.h" diff --git a/src/xvectorbin/Makefile b/src/xvectorbin/Makefile index 63a689a1e44..1dc1bee6e0a 100644 --- a/src/xvectorbin/Makefile +++ b/src/xvectorbin/Makefile @@ -6,7 +6,8 @@ include ../kaldi.mk LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) -BINFILES = nnet3-xvector-get-egs +BINFILES = nnet3-xvector-get-egs nnet3-xvector-compute-prob \ + nnet3-xvector-show-progress nnet3-xvector-train OBJFILES = @@ -15,7 +16,7 @@ cuda-compiled.o: ../kaldi.mk TESTFILES = -ADDLIBS = ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \ +ADDLIBS = ../xvector/kaldi-xvector.a ../nnet3/kaldi-nnet3.a ../gmm/kaldi-gmm.a \ ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../tree/kaldi-tree.a \ ../thread/kaldi-thread.a ../cudamatrix/kaldi-cudamatrix.a \ From c1fac7ef61e4afcd9a5fa07f273edaf06f479cc3 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Sat, 20 Feb 2016 14:03:30 -0500 Subject: [PATCH 27/32] xvector: fixing Makefiles --- src/Makefile | 9 +++++---- src/ivector/Makefile | 3 +-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Makefile b/src/Makefile index 57a4b98e0aa..c3fe511486f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -9,13 +9,13 @@ SUBDIRS = base matrix util feat tree thread gmm transform sgmm \ fstext hmm lm decoder lat kws cudamatrix nnet \ bin fstbin gmmbin fgmmbin sgmmbin featbin \ nnetbin latbin sgmm2 sgmm2bin nnet2 nnet3 chain nnet3bin nnet2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin chainbin + ivector ivectorbin xvector xvectorbin online2 online2bin lmbin chainbin MEMTESTDIRS = base matrix util feat tree thread gmm transform sgmm \ fstext hmm lm decoder lat nnet kws chain \ bin fstbin gmmbin fgmmbin sgmmbin featbin \ nnetbin latbin sgmm2 nnet2 nnet3 nnet2bin nnet3bin sgmm2bin kwsbin \ - ivector ivectorbin online2 online2bin lmbin + ivector ivectorbin xvector xvectorbin online2 online2bin lmbin CUDAMEMTESTDIR = cudamatrix @@ -145,9 +145,9 @@ $(EXT_SUBDIRS) : mklibdir # this is necessary for correct parallel compilation #1)The tools depend on all the libraries -bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin lmbin kwsbin online2bin: \ +bin fstbin gmmbin fgmmbin sgmmbin sgmm2bin featbin nnetbin nnet2bin nnet3bin chainbin latbin ivectorbin xvectorbin lmbin kwsbin online2bin: \ base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm \ - lm decoder lat cudamatrix nnet nnet2 nnet3 ivector + lm decoder lat cudamatrix nnet nnet2 nnet3 ivector xvector #2)The libraries have inter-dependencies base: @@ -172,6 +172,7 @@ nnet2: base util matrix thread lat gmm hmm tree transform cudamatrix nnet3: base util 
matrix thread lat gmm hmm tree transform cudamatrix chain chain: lat hmm tree fstext matrix cudamatrix util base ivector: base util matrix thread transform tree gmm +xvector: base util matrix cudamatrix nnet3 #3)Dependencies for optional parts of Kaldi onlinebin: base matrix util feat tree optimization gmm transform sgmm sgmm2 fstext hmm lm decoder lat cudamatrix nnet nnet2 online thread # python-kaldi-decoding: base matrix util feat tree optimization thread gmm transform sgmm sgmm2 fstext hmm decoder lat online diff --git a/src/ivector/Makefile b/src/ivector/Makefile index bbf4b01faf9..879cc6e69b2 100644 --- a/src/ivector/Makefile +++ b/src/ivector/Makefile @@ -15,8 +15,7 @@ OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o logistic-regres LIBNAME = kaldi-ivector ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../transform/kaldi-transform.a \ - ../thread/kaldi-thread.a ../nnet3/kaldi-nnet3.a ../cudamatrix/kaldi-cudamatrix.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ + ../thread/kaldi-thread.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ ../util/kaldi-util.a include ../makefiles/default_rules.mk From 5cdf82eec7ed11a5269ed4597bb2e5535c5c296e Mon Sep 17 00:00:00 2001 From: David Snyder Date: Sat, 20 Feb 2016 22:12:23 -0500 Subject: [PATCH 28/32] xvector: Fixes to make_jesus_configs.py --- .../s5/steps/nnet3/xvector/make_jesus_configs.py | 14 +++++++------- src/nnet3/nnet-general-component.cc | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py b/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py index 438f20b083e..51d58c5b89c 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py +++ b/egs/wsj/s5/steps/nnet3/xvector/make_jesus_configs.py @@ -138,7 +138,6 @@ def __init__(self, config_string, input_dim, num_jesus_blocks, input_name): def OutputDim(self): return self.input_dim * (2 if self.output_stddev else 1) + (self.num_jesus_blocks if self.output_count else 0) - # OutputDims() returns an array of output dimensions... this node produces # one output node, but this array explains how it's split up into different types # of output (which will affect how we reorder the indexes for the jesus-layer). @@ -168,10 +167,11 @@ def WriteConfigs(self, f): self.input_name, self.left_context, self.right_context), file=f) stats_dim = 1 + self.input_dim * (2 if self.output_stddev else 1) print('component name={0}-pooling-{1}-{2} type=StatisticsPoolingComponent input-dim={3} ' - 'input-period={4} left-context={1} right-context={2} num-log-count-features=0 ' + 'input-period={4} left-context={1} right-context={2} num-log-count-features={6} ' 'output-stddevs={5} '.format(self.input_name, self.left_context, self.right_context, stats_dim, self.stats_period, - ('true' if self.output_stddev else 'false')), + ('true' if self.output_stddev else 'false'), + (self.num_jesus_blocks if self.output_count else 0)), file=f) print('component-node name={0}-pooling-{1}-{2} component={0}-pooling-{1}-{2} input={0}-extraction-{1}-{2} '.format( self.input_name, self.left_context, self.right_context), file=f) @@ -369,7 +369,6 @@ def WriteConfigs(self, f): cur_output = 'x-jesus{0}-output-affine'.format(l) - print('component name=x-final-relu type=RectifiedLinearComponent dim={0} self-repair-scale={1}'.format( cur_affine_output_dim, args.self_repair_scale), file=f) print('component-node name=x-final-relu component=x-final-relu input={0}'.format(cur_output), @@ -394,10 +393,11 @@ def WriteConfigs(self, f): # nodes. 
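# (The 's' output created below holds a symmetric matrix in packed triangular
# form, which is why its dimension is output_dim * (output_dim + 1) / 2 rather
# than output_dim * output_dim.)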
# First the S output...
-s_dim = ((args.output_dim)*(args.output_dim+1)) / 2
+s_dim = ((args.output_dim)*(args.output_dim+1))/2
+
print('component name=x-s type=ConstantFunctionComponent input-dim={0} output-dim={1} '
'output-mean=0 output-stddev=0 '.format(
- args.feat_dim, ((args.output_dim)+(args.output_dim+1))/2), file=f)
+ args.feat_dim, s_dim), file=f)
print('component-node name=x-s component=x-s input=IfDefined(input)',
file=f)
print('component name=x-s-scale type=FixedScaleComponent dim={0} scale={1}'.format(
@@ -413,7 +413,7 @@ def WriteConfigs(self, f):
print('component-node name=x-b component=x-b input=IfDefined(input)',
file=f)
print('component name=x-b-scale type=FixedScaleComponent dim=1 scale={0}'.format(
args.b_scale), file=f);
-print('component-node name=x-b-scale component=x-b-scale input=input',
+print('component-node name=x-b-scale component=x-b-scale input=x-b',
file=f)
print('output-node name=b input=x-b-scale', file=f)
f.close()
diff --git a/src/nnet3/nnet-general-component.cc b/src/nnet3/nnet-general-component.cc
index 80793bf1d98..f40a750f894 100644
--- a/src/nnet3/nnet-general-component.cc
+++ b/src/nnet3/nnet-general-component.cc
@@ -847,7 +847,7 @@ void StatisticsPoolingComponent::Backprop(
variance_deriv.Scale(0.5);
// the deriv w.r.t. the uncentered variance is the same as w.r.t. the
- // uncentered variance (since they difer by a constant term of -(mean *
+ // uncentered variance (since they differ by a constant term of -(mean *
// mean), but we need to add to dF/dmean, the value -2.0 * mean *
// dF/dvariance.
mean_deriv.AddMatMatElements(-2.0, mean_value, variance_deriv, 1.0);
From 72dfb918a421b65af33ab8d26e19e509a0003f7b Mon Sep 17 00:00:00 2001
From: David Snyder
Date: Sun, 21 Feb 2016 19:56:43 -0500
Subject: [PATCH 29/32] xvector: add binary nnet3-xvector-merge-egs, which is the same as nnet3-merge-egs, but it doesn't rely on there being any outputs in the examples. Also some minor fixes to the training scripts.
---
 egs/wsj/s5/steps/nnet3/xvector/get_egs.sh | 1 +
 egs/wsj/s5/steps/nnet3/xvector/train.sh | 14 +--
 src/xvectorbin/Makefile | 3 +-
 src/xvectorbin/nnet3-xvector-merge-egs.cc | 108 ++++++++++++++++++++++
 4 files changed, 118 insertions(+), 8 deletions(-)
 create mode 100644 src/xvectorbin/nnet3-xvector-merge-egs.cc
diff --git a/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh b/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh
index 2ab81395d47..7c74fff6090 100755
--- a/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh
+++ b/egs/wsj/s5/steps/nnet3/xvector/get_egs.sh
@@ -161,6 +161,7 @@ echo $num_train_frames >$dir/info/num_frames
num_train_archives=$[($num_train_frames*$num_repeats)/$frames_per_iter + 1]
echo "$0: producing $num_train_archives archives for training"
echo $num_train_archives > $dir/info/num_archives
+echo $num_diagnostic_archives > $dir/info/num_diagnostic_archives
if [ $nj -gt $num_train_archives ]; then
diff --git a/egs/wsj/s5/steps/nnet3/xvector/train.sh b/egs/wsj/s5/steps/nnet3/xvector/train.sh
index b66c95b3c39..c57d66f7019 100755
--- a/egs/wsj/s5/steps/nnet3/xvector/train.sh
+++ b/egs/wsj/s5/steps/nnet3/xvector/train.sh
@@ -132,12 +132,12 @@ while [ $x -lt $num_iters ]; do
# Set off jobs doing some diagnostics, in the background.
# Use the egs dir from the previous iteration for the diagnostics - $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.log \ - nnet3-xvector-compute-prob "$dir/$x.raw - |" \ - "ark:nnet3-merge-egs ark:$egs_dir/valid_diagnostic.JOB.egs ark:- |" & - $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.log \ - nnet3-xvector-compute-prob "nnet3-am-copy --raw=true $dir/$x.raw - |" \ - "ark:nnet3-merge-egs ark:$egs_dir/train_diagnostic.JOB.egs ark:- |" & + $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ + nnet3-xvector-compute-prob $dir/$x.raw \ + "ark:nnet3-xvector-merge-egs ark:$egs_dir/valid_diagnostic_egs.JOB.ark ark:- |" & + $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.JOB.log \ + nnet3-xvector-compute-prob $dir/$x.raw \ + "ark:nnet3-xvector-merge-egs ark:$egs_dir/train_diagnostic_egs.JOB.ark ark:- |" & if [ $x -gt 0 ]; then $cmd $dir/log/progress.$x.log \ @@ -176,7 +176,7 @@ while [ $x -lt $num_iters ]; do nnet3-xvector-train $parallel_train_opts --print-interval=10 \ --max-param-change=$max_param_change \ $dir/$x.raw \ - "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --minibatch-size=$minibatch_size --measure-output-frames=false --discard-partial-minibatches=true ark:- ark:- |" \ + "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-xvector-merge-egs --minibatch-size=$minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait diff --git a/src/xvectorbin/Makefile b/src/xvectorbin/Makefile index 1dc1bee6e0a..e0703ab8422 100644 --- a/src/xvectorbin/Makefile +++ b/src/xvectorbin/Makefile @@ -7,7 +7,8 @@ LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) BINFILES = nnet3-xvector-get-egs nnet3-xvector-compute-prob \ - nnet3-xvector-show-progress nnet3-xvector-train + nnet3-xvector-show-progress nnet3-xvector-train \ + nnet3-xvector-merge-egs OBJFILES = diff --git a/src/xvectorbin/nnet3-xvector-merge-egs.cc b/src/xvectorbin/nnet3-xvector-merge-egs.cc new file mode 100644 index 00000000000..28dc9d2ee18 --- /dev/null +++ b/src/xvectorbin/nnet3-xvector-merge-egs.cc @@ -0,0 +1,108 @@ +// xvectorbin/nnet3-xvector-merge-egs.cc + +// Copyright 2016 David Snyder +// 2012-2015 Johns Hopkins University (author: Daniel Povey) +// 2014 Vimal Manohar + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-example.h" +#include "nnet3/nnet-example-utils.h" + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + + const char *usage = + "This copies nnet examples for xvector training from input to\n" + "output but while doing so it merges many NnetExample objects\n" + "into one, forming a minibatch consisting of single NnetExample.\n" + "Unlike nnet3-merge-egs, this binary does not expect the examples\n" + "to have any output.\n" + "\n" + "Usage: nnet3-xvector-merge-egs [options] " + "\n" + "e.g.\n" + "nnet3-xvector-merge-egs --minibatch-size=512 ark:1.egs ark:- " + "| nnet3-xvector-train ... \n" + "See also nnet3-copy-egs and nnet3-merge-egs\n"; + + bool compress = false; + int32 minibatch_size = 512; + bool discard_partial_minibatches = false; + + ParseOptions po(usage); + po.Register("minibatch-size", &minibatch_size, "Target size of " + "minibatches when merging."); + po.Register("compress", &compress, "If true, compress the output examples " + "(not recommended unless you are writing to disk)"); + po.Register("discard-partial-minibatches", &discard_partial_minibatches, + "discard any partial minibatches of 'uneven' size that may be " + "encountered at the end."); + + po.Read(argc, argv); + + if (po.NumArgs() != 2) { + po.PrintUsage(); + exit(1); + } + + std::string examples_rspecifier = po.GetArg(1), + examples_wspecifier = po.GetArg(2); + + SequentialNnetExampleReader example_reader(examples_rspecifier); + NnetExampleWriter example_writer(examples_wspecifier); + + std::vector examples; + examples.reserve(minibatch_size); + + int32 num_read = 0, num_written = 0; + while (!example_reader.Done()) { + const NnetExample &cur_eg = example_reader.Value(); + examples.resize(examples.size() + 1); + examples.back() = cur_eg; + bool minibatch_ready = + static_cast(examples.size()) >= minibatch_size; + + // Do Next() now, so we can test example_reader.Done() below . + example_reader.Next(); + num_read++; + + if (minibatch_ready || (!discard_partial_minibatches && + (example_reader.Done() && !examples.empty()))) { + NnetExample merged_eg; + MergeExamples(examples, compress, &merged_eg); + std::ostringstream ostr; + ostr << "merged-" << num_written; + num_written++; + std::string output_key = ostr.str(); + example_writer.Write(output_key, merged_eg); + examples.clear(); + } + } + KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; + return (num_written != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what() << '\n'; + return -1; + } +} + + From a8c0339edb544285fdebb55782cd11f0fe5d2041 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Sun, 21 Feb 2016 20:14:30 -0500 Subject: [PATCH 30/32] xvector: undoing most of the previous commit--instead of creating a separate binary for merging egs for xvectors, the nnet3bin version was modified instead to allow for egs without output. 
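For context: with --measure-output-frames=false, nnet3-merge-egs counts whole
examples rather than output frames when deciding that a minibatch is complete,
so it can merge xvector egs that contain no output. A rough sketch of the
merging stage as the training script now invokes it (the archive name
egs.1.ark and the minibatch size of 512 are only illustrative here; see the
train.sh change below for the actual pipeline):

  nnet3-merge-egs --measure-output-frames=false --minibatch-size=512 \
    --discard-partial-minibatches=true ark:egs.1.ark ark:- | \
    nnet3-xvector-train ...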
--- egs/wsj/s5/steps/nnet3/xvector/train.sh | 6 +- src/nnet3bin/nnet3-merge-egs.cc | 11 ++- src/xvectorbin/Makefile | 3 +- src/xvectorbin/nnet3-xvector-merge-egs.cc | 108 ---------------------- 4 files changed, 10 insertions(+), 118 deletions(-) delete mode 100644 src/xvectorbin/nnet3-xvector-merge-egs.cc diff --git a/egs/wsj/s5/steps/nnet3/xvector/train.sh b/egs/wsj/s5/steps/nnet3/xvector/train.sh index c57d66f7019..a05c62c5124 100755 --- a/egs/wsj/s5/steps/nnet3/xvector/train.sh +++ b/egs/wsj/s5/steps/nnet3/xvector/train.sh @@ -134,10 +134,10 @@ while [ $x -lt $num_iters ]; do # Use the egs dir from the previous iteration for the diagnostics $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_valid.$x.JOB.log \ nnet3-xvector-compute-prob $dir/$x.raw \ - "ark:nnet3-xvector-merge-egs ark:$egs_dir/valid_diagnostic_egs.JOB.ark ark:- |" & + "ark:nnet3-merge-egs --measure-output-frames=false ark:$egs_dir/valid_diagnostic_egs.JOB.ark ark:- |" & $cmd JOB=1:$num_diagnostic_archives $dir/log/compute_prob_train.$x.JOB.log \ nnet3-xvector-compute-prob $dir/$x.raw \ - "ark:nnet3-xvector-merge-egs ark:$egs_dir/train_diagnostic_egs.JOB.ark ark:- |" & + "ark:nnet3-merge-egs --measure-output-frames=false ark:$egs_dir/train_diagnostic_egs.JOB.ark ark:- |" & if [ $x -gt 0 ]; then $cmd $dir/log/progress.$x.log \ @@ -176,7 +176,7 @@ while [ $x -lt $num_iters ]; do nnet3-xvector-train $parallel_train_opts --print-interval=10 \ --max-param-change=$max_param_change \ $dir/$x.raw \ - "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-xvector-merge-egs --minibatch-size=$minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ + "ark:nnet3-copy-egs ark:$egs_dir/egs.$archive.ark ark:- | nnet3-shuffle-egs --buffer-size=$shuffle_buffer_size --srand=$x ark:- ark:-| nnet3-merge-egs --measure-output-frames=false --minibatch-size=$minibatch_size --discard-partial-minibatches=true ark:- ark:- |" \ $dir/$[$x+1].$n.raw || touch $dir/.error & done wait diff --git a/src/nnet3bin/nnet3-merge-egs.cc b/src/nnet3bin/nnet3-merge-egs.cc index 8627671f53a..fe528486238 100644 --- a/src/nnet3bin/nnet3-merge-egs.cc +++ b/src/nnet3bin/nnet3-merge-egs.cc @@ -58,7 +58,7 @@ int main(int argc, char *argv[]) { "e.g.\n" "nnet3-merge-egs --minibatch-size=512 ark:1.egs ark:- | nnet3-train-simple ... \n" "See also nnet3-copy-egs\n"; - + bool compress = false; int32 minibatch_size = 512; bool measure_output_frames = true; @@ -76,7 +76,7 @@ int main(int argc, char *argv[]) { po.Register("discard-partial-minibatches", &discard_partial_minibatches, "discard any partial minibatches of 'uneven' size that may be " "encountered at the end."); - + po.Read(argc, argv); if (po.NumArgs() != 2) { @@ -89,18 +89,19 @@ int main(int argc, char *argv[]) { SequentialNnetExampleReader example_reader(examples_rspecifier); NnetExampleWriter example_writer(examples_wspecifier); - + std::vector examples; examples.reserve(minibatch_size); int32 cur_num_output_frames = 0; - + int64 num_read = 0, num_written = 0; while (!example_reader.Done()) { const NnetExample &cur_eg = example_reader.Value(); examples.resize(examples.size() + 1); examples.back() = cur_eg; - cur_num_output_frames += NumOutputIndexes(cur_eg); + if (measure_output_frames) + cur_num_output_frames += NumOutputIndexes(cur_eg); bool minibatch_ready = (measure_output_frames ? 
cur_num_output_frames >= minibatch_size : diff --git a/src/xvectorbin/Makefile b/src/xvectorbin/Makefile index e0703ab8422..1dc1bee6e0a 100644 --- a/src/xvectorbin/Makefile +++ b/src/xvectorbin/Makefile @@ -7,8 +7,7 @@ LDFLAGS += $(CUDA_LDFLAGS) LDLIBS += $(CUDA_LDLIBS) BINFILES = nnet3-xvector-get-egs nnet3-xvector-compute-prob \ - nnet3-xvector-show-progress nnet3-xvector-train \ - nnet3-xvector-merge-egs + nnet3-xvector-show-progress nnet3-xvector-train OBJFILES = diff --git a/src/xvectorbin/nnet3-xvector-merge-egs.cc b/src/xvectorbin/nnet3-xvector-merge-egs.cc deleted file mode 100644 index 28dc9d2ee18..00000000000 --- a/src/xvectorbin/nnet3-xvector-merge-egs.cc +++ /dev/null @@ -1,108 +0,0 @@ -// xvectorbin/nnet3-xvector-merge-egs.cc - -// Copyright 2016 David Snyder -// 2012-2015 Johns Hopkins University (author: Daniel Povey) -// 2014 Vimal Manohar - -// See ../../COPYING for clarification regarding multiple authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED -// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, -// MERCHANTABLITY OR NON-INFRINGEMENT. -// See the Apache 2 License for the specific language governing permissions and -// limitations under the License. - -#include "base/kaldi-common.h" -#include "util/common-utils.h" -#include "nnet3/nnet-example.h" -#include "nnet3/nnet-example-utils.h" - -int main(int argc, char *argv[]) { - try { - using namespace kaldi; - using namespace kaldi::nnet3; - typedef kaldi::int32 int32; - - const char *usage = - "This copies nnet examples for xvector training from input to\n" - "output but while doing so it merges many NnetExample objects\n" - "into one, forming a minibatch consisting of single NnetExample.\n" - "Unlike nnet3-merge-egs, this binary does not expect the examples\n" - "to have any output.\n" - "\n" - "Usage: nnet3-xvector-merge-egs [options] " - "\n" - "e.g.\n" - "nnet3-xvector-merge-egs --minibatch-size=512 ark:1.egs ark:- " - "| nnet3-xvector-train ... 
\n" - "See also nnet3-copy-egs and nnet3-merge-egs\n"; - - bool compress = false; - int32 minibatch_size = 512; - bool discard_partial_minibatches = false; - - ParseOptions po(usage); - po.Register("minibatch-size", &minibatch_size, "Target size of " - "minibatches when merging."); - po.Register("compress", &compress, "If true, compress the output examples " - "(not recommended unless you are writing to disk)"); - po.Register("discard-partial-minibatches", &discard_partial_minibatches, - "discard any partial minibatches of 'uneven' size that may be " - "encountered at the end."); - - po.Read(argc, argv); - - if (po.NumArgs() != 2) { - po.PrintUsage(); - exit(1); - } - - std::string examples_rspecifier = po.GetArg(1), - examples_wspecifier = po.GetArg(2); - - SequentialNnetExampleReader example_reader(examples_rspecifier); - NnetExampleWriter example_writer(examples_wspecifier); - - std::vector examples; - examples.reserve(minibatch_size); - - int32 num_read = 0, num_written = 0; - while (!example_reader.Done()) { - const NnetExample &cur_eg = example_reader.Value(); - examples.resize(examples.size() + 1); - examples.back() = cur_eg; - bool minibatch_ready = - static_cast(examples.size()) >= minibatch_size; - - // Do Next() now, so we can test example_reader.Done() below . - example_reader.Next(); - num_read++; - - if (minibatch_ready || (!discard_partial_minibatches && - (example_reader.Done() && !examples.empty()))) { - NnetExample merged_eg; - MergeExamples(examples, compress, &merged_eg); - std::ostringstream ostr; - ostr << "merged-" << num_written; - num_written++; - std::string output_key = ostr.str(); - example_writer.Write(output_key, merged_eg); - examples.clear(); - } - } - KALDI_LOG << "Merged " << num_read << " egs to " << num_written << '.'; - return (num_written != 0 ? 0 : 1); - } catch(const std::exception &e) { - std::cerr << e.what() << '\n'; - return -1; - } -} - - From 0e744ddadf64a064e33e90bf1741bffeb3fa7925 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Mon, 22 Feb 2016 14:17:01 -0500 Subject: [PATCH 31/32] xvector: fix to nnet3-xvector-get-egs.cc and fix to nnet3-xvector-get-egs.cc (still not working though) and removing trailing spaces --- src/xvector/nnet-xvector-training.cc | 26 ++++++++++++++----------- src/xvectorbin/nnet3-xvector-get-egs.cc | 4 ++-- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/xvector/nnet-xvector-training.cc b/src/xvector/nnet-xvector-training.cc index 4ef12e5aaca..1dc8e056fd7 100644 --- a/src/xvector/nnet-xvector-training.cc +++ b/src/xvector/nnet-xvector-training.cc @@ -51,7 +51,7 @@ NnetXvectorTrainer::NnetXvectorTrainer(const NnetTrainerOptions &config, KALDI_WARN << "Could not open cached computation. " "Probably this is the first training iteration."; } - } + } } @@ -96,7 +96,7 @@ void NnetXvectorTrainer::Train(const NnetExample &eg) { if (config_.write_cache != "") { Output ko(config_.write_cache, config_.binary_write_cache); compiler_.WriteCache(ko.Stream(), config_.binary_write_cache); - } + } } void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { @@ -104,12 +104,12 @@ void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { if (nnet_->IsOutputNode(node_index)) { BaseFloat tot_weight, tot_objf; bool supply_deriv = true; - // For each xvector output node, we expect two output nodes with name "s" + // For each xvector output node, we expect two output nodes with name "s" // and "b", which store symmetric affine transformation and bias term // for xvector-objective computation. 
std::string xvector_name = nnet_->GetNodeName(node_index), s_name = "s", b_name = "b"; - if (nnet_->GetNodeIndex(s_name) == -1 || nnet_->GetNodeIndex(b_name) == -1) + if (nnet_->GetNodeIndex(s_name) == -1 || nnet_->GetNodeIndex(b_name) == -1) KALDI_ERR << "The nnet expected to have two output nodes with name s and b."; if (xvector_name != s_name && xvector_name != b_name) { @@ -119,11 +119,11 @@ void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { CuMatrix xvector_deriv(xvector_pairs.NumRows(), xvector_pairs.NumCols(), kUndefined); int32 s_dim = xvector_pairs.NumCols() * (xvector_pairs.NumCols() + 1) / 2; - - // convert CuVector to CuSpMatrix + + // convert CuVector to CuSpMatrix CuSpMatrix xvec_s_sp(s_dim); xvec_s_sp.CopyFromVec(xvec_s.Row(0)); - + CuVector deriv_s(s_dim); BaseFloat xvec_b_val = xvec_b(0,0), deriv_b; ComputeXvectorObjfAndDeriv(xvector_pairs, xvec_s_sp, xvec_b_val, @@ -142,7 +142,7 @@ void NnetXvectorTrainer::ProcessOutputs(NnetComputer *computer) { computer->AcceptOutputDeriv(s_name, &deriv_s_mat); computer->AcceptOutputDeriv(b_name, &deriv_b_mat); } - + objf_info_[xvector_name].UpdateStats(xvector_name, config_.print_interval, num_minibatches_processed_++, tot_weight, tot_objf); @@ -221,7 +221,7 @@ bool ObjectiveFunctionInfo::PrintTotalStats(const std::string &name) const { << (tot_objf / tot_weight) << " over " << tot_weight << " frames."; } else { KALDI_LOG << "Overall average objective function for '" << name << "' is " - << objf << " + " << aux_objf << " = " << sum_objf + << objf << " + " << aux_objf << " = " << sum_objf << " over " << tot_weight << " frames."; } KALDI_LOG << "[this line is to be parsed by a script:] " @@ -245,33 +245,36 @@ void GetComputationRequestXvector(const Nnet &nnet, request->outputs.reserve(eg.io.size()); request->need_model_derivative = need_model_derivative; request->store_component_stats = store_component_stats; + // xvector-egs have multiple inputs(e.g. different inputs correspond // to different chunks and no outputs. for (size_t i = 0; i < eg.io.size(); i++) { const NnetIo &io = eg.io[i]; const std::string &name = io.name; int32 node_index = nnet.GetNodeIndex(name); + if (node_index == -1 && !nnet.IsInputNode(node_index)) KALDI_ERR << "xvector example has input named '" << name << "', but no such input node is in the network."; std::vector &dest = request->inputs; - // nnet.IsInputNode(node_index) ? request->inputs : request->outputs; dest.resize(dest.size() + 1); IoSpecification &io_spec = dest.back(); io_spec.name = name; io_spec.indexes = io.indexes; io_spec.has_deriv = nnet.IsOutputNode(node_index) && need_model_derivative; } + // We only need the output on frame t=0 for each n. int32 io_index_size = request->inputs[0].indexes.size(); std::vector output_indexes; output_indexes.resize(io_index_size); - for (int32 ind = 0; io_index_size; ind++) { + for (int32 ind = 0; ind < io_index_size; ind++) { output_indexes[ind].n = ind; output_indexes[ind].t = 0; } + // In order to generate computation request for output nodes, // we should find output nodes and add io_spec for each one. int32 num_nodes = nnet.NumNodes(); @@ -285,6 +288,7 @@ void GetComputationRequestXvector(const Nnet &nnet, io_spec.has_deriv = need_model_derivative; } } + // check to see if something went wrong. 
if (request->inputs.empty()) KALDI_ERR << "No inputs in computation request."; diff --git a/src/xvectorbin/nnet3-xvector-get-egs.cc b/src/xvectorbin/nnet3-xvector-get-egs.cc index fd1dc2e943b..8c9610f9429 100644 --- a/src/xvectorbin/nnet3-xvector-get-egs.cc +++ b/src/xvectorbin/nnet3-xvector-get-egs.cc @@ -110,8 +110,8 @@ static void WriteExamples(const MatrixBase &feats, pair->num_frames1, 0, feat_dim), chunk2(feats, pair->start_frame2 + shift2, pair->num_frames2, 0, feat_dim); - NnetIo nnet_io1 = NnetIo("input1", 0, chunk1), - nnet_io2 = NnetIo("input2", 0, chunk2); + NnetIo nnet_io1 = NnetIo("input", 0, chunk1), + nnet_io2 = NnetIo("input", 0, chunk2); for (std::vector::iterator indx_it = nnet_io1.indexes.begin(); indx_it != nnet_io1.indexes.end(); ++indx_it) indx_it->n = 0; From fbfc27bfb4929f59b0d294c900024671bfca1060 Mon Sep 17 00:00:00 2001 From: David Snyder Date: Mon, 22 Feb 2016 16:18:02 -0500 Subject: [PATCH 32/32] xvector: fix in nnet-xvector-training.cc --- src/xvector/nnet-xvector-diagnostics.cc | 12 ++++++------ src/xvector/nnet-xvector-training.cc | 15 ++++++++++----- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/xvector/nnet-xvector-diagnostics.cc b/src/xvector/nnet-xvector-diagnostics.cc index 03a018dc66d..820096118c1 100644 --- a/src/xvector/nnet-xvector-diagnostics.cc +++ b/src/xvector/nnet-xvector-diagnostics.cc @@ -76,11 +76,11 @@ void NnetXvectorComputeProb::Compute(const NnetExample &eg) { } void NnetXvectorComputeProb::ProcessOutputs(NnetComputer *computer) { - for (int32 node_index = 0; node_index < nnet_.NumNodes(); node_index++) { - if (nnet_.IsOutputNode(node_index)) { + for (int32 node_index = 0; node_index < nnet_.NumNodes(); node_index++) { + if (nnet_.IsOutputNode(node_index)) { std::string xvector_name = nnet_.GetNodeName(node_index), s_name = "s", b_name = "b"; - if (nnet_.GetNodeIndex(s_name) == -1 || nnet_.GetNodeIndex(b_name) == -1) + if (nnet_.GetNodeIndex(s_name) == -1 || nnet_.GetNodeIndex(b_name) == -1) KALDI_ERR << "The nnet expected to have two output nodes with name s and b."; if (xvector_name != s_name && xvector_name != b_name) { @@ -90,11 +90,11 @@ void NnetXvectorComputeProb::ProcessOutputs(NnetComputer *computer) { CuMatrix xvector_deriv(xvector_pairs.NumRows(), xvector_pairs.NumCols(), kUndefined); int32 s_dim = xvector_pairs.NumCols() * (xvector_pairs.NumCols() + 1) / 2; - - // convert CuVector to CuSpMatrix + + // convert CuVector to CuSpMatrix CuSpMatrix xvec_s_sp(s_dim); xvec_s_sp.CopyFromVec(xvec_s.Row(0)); - + CuVector deriv_s(s_dim); BaseFloat xvec_b_val = xvec_b(0,0), deriv_b; BaseFloat tot_weight, tot_objf; diff --git a/src/xvector/nnet-xvector-training.cc b/src/xvector/nnet-xvector-training.cc index 1dc8e056fd7..9abd62937f3 100644 --- a/src/xvector/nnet-xvector-training.cc +++ b/src/xvector/nnet-xvector-training.cc @@ -267,12 +267,17 @@ void GetComputationRequestXvector(const Nnet &nnet, } // We only need the output on frame t=0 for each n. 
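// (Each chunk appears in the inputs as a distinct 'n' value with several 't'
// values, so the number of input indexes whose t is zero equals the number of
// chunks, i.e. the number of output indexes required.)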
- int32 io_index_size = request->inputs[0].indexes.size(); + int32 io_index_size = request->inputs[0].indexes.size(), + n_indx_size = 0; std::vector output_indexes; - output_indexes.resize(io_index_size); - for (int32 ind = 0; ind < io_index_size; ind++) { - output_indexes[ind].n = ind; - output_indexes[ind].t = 0; + for (int32 indx = 0; indx < io_index_size; indx++) + if (request->inputs[0].indexes[indx].t == 0) + n_indx_size++; + + output_indexes.resize(n_indx_size); + for (int32 indx = 0; indx < n_indx_size; indx++) { + output_indexes[indx].n = indx; + output_indexes[indx].t = 0; } // In order to generate computation request for output nodes,