From 19355bff3509ccdd384dd41559e1a991d816eb71 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 24 Feb 2019 17:25:31 -0500
Subject: [PATCH 1/2] [scripts] Add missing option as default in nnet3
 model-cleanup script

---
 .../cleanup/segment_long_utterances_nnet3.sh  | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
index ae355c9f753..27577f2b17e 100755
--- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
+++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
@@ -4,7 +4,7 @@
 #           2016  Vimal Manohar
 # Apache 2.0
 
-# This script is similar to steps/cleanup/segment_long_utterances.sh, but 
+# This script is similar to steps/cleanup/segment_long_utterances.sh, but
 # uses nnet3 acoustic model instead of GMM acoustic model for decoding.
 # This script performs segmentation of the input data based on the transcription
 # and outputs segmented data along with the corresponding aligned transcription.
@@ -13,7 +13,7 @@
 # are of manageable length for further processing, along with the portion of the
 # transcript that seems to match (aligns with) each segment.
 # This the light-supervised training scenario where the input transcription is
-# not expected to be completely clean and may have significant errors. 
+# not expected to be completely clean and may have significant errors.
 # See "JHU Kaldi System for Arabic MGB-3 ASR Challenge using Diarization,
 # Audio-transcript Alignment and Transfer Learning": Vimal Manohar, Daniel
 # Povey, Sanjeev Khudanpur, ASRU 2017
@@ -49,14 +49,14 @@ post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10
 
 # Contexts must ideally match training
 extra_left_context=0  # Set to some large value, typically 40 for LSTM (must match training)
-extra_right_context=0  
+extra_right_context=0
 extra_left_context_initial=-1
 extra_right_context_final=-1
 frames_per_chunk=150
 
 # i-vector options
-extractor=    # i-Vector extractor. If provided, will extract i-vectors. 
-              # Required if the network was trained with i-vector extractor. 
+extractor=    # i-Vector extractor. If provided, will extract i-vectors.
+              # Required if the network was trained with i-vector extractor.
 use_vad=false # Use energy-based VAD for i-vector extraction
 
 # TF-IDF similarity search options
@@ -116,12 +116,12 @@ it and eliminate data where the transcript doesn't seem to match.
     --segmentation-extra-opts 'opts'  # Additional options to segment_ctm_edits_mild.py.
                                 # Please run steps/cleanup/internal/segment_ctm_edits_mild.py
                                 # without arguments to see allowed options.
-    --align-full-hyp <true|false>  # If true, align full hypothesis 
-                                   i.e. trackback from the end to get the alignment. 
-                                   This is different from the normal 
+    --align-full-hyp <true|false>  # If true, align full hypothesis
+                                   i.e. traceback from the end to get the alignment.
+                                   This is different from the normal
                                    Smith-Waterman alignment, where the
                                    traceback will be from the maximum score.
-    --extractor <extractor>     # i-vector extractor directory if i-vector is 
+    --extractor <extractor>     # i-vector extractor directory if i-vector is
                                 # to be used during decoding. Must match
                                 # the extractor used for training neural-network.
     --use-vad <true|false>      # If true, uses energy-based VAD to apply frame weights
@@ -221,6 +221,7 @@ if [ $stage -le 3 ]; then
 
   # Make graphs w.r.t. to the original text (usually recording-level)
   steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
+    --scale-opts "--self-loop-scale=1.0 --transition-scale=1.0" \
     --nj $nj --cmd "$cmd" $text \
     $lang $dir $dir/graphs
   if [ -z "$utt2text" ]; then

From 974fa81f74709d87c32e19248355ef8c39fadc44 Mon Sep 17 00:00:00 2001
From: Daniel Povey <dpovey@gmail.com>
Date: Sun, 24 Feb 2019 17:41:52 -0500
Subject: [PATCH 2/2] [scripts] Make sure nnet3 segmentation opts set right for
 chain systems

---
 .../cleanup/segment_long_utterances_nnet3.sh  | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
index 27577f2b17e..751200bdf83 100755
--- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
+++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
@@ -4,6 +4,7 @@
 #           2016  Vimal Manohar
 # Apache 2.0
 
+
 # This script is similar to steps/cleanup/segment_long_utterances.sh, but
 # uses nnet3 acoustic model instead of GMM acoustic model for decoding.
 # This script performs segmentation of the input data based on the transcription
@@ -39,13 +40,11 @@ seconds_per_spk_max=30
 
 # Decode options
 graph_opts=
+scale_opts=  # for making the graphs: passed as --scale-opts to make_biased_lm_graphs.sh; if empty, chain systems get a default below
 beam=15.0
 lattice_beam=1.0
 lmwt=10
-
 acwt=0.1  # Just a default value, used for adaptation and beam-pruning..
-post_decode_acwt=1.0  # can be used in 'chain' systems to scale acoustics by 10 so the
-                      # regular scoring script works.
 
 # Contexts must ideally match training
 extra_left_context=0  # Set to some large value, typically 40 for LSTM (must match training)
@@ -168,6 +167,23 @@ cp $srcdir/cmvn_opts $dir
 cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true
 cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true
 
+# A frame_subsampling_factor file in $srcdir is taken to mean a 'chain' model; options still at their defaults get chain values.
+if [ -f "$srcdir/frame_subsampling_factor" ]; then
+  echo "$0: guessing that this is a chain system, checking parameters."
+  if [ -z "$scale_opts" ]; then  # quoted: a multi-word value would otherwise make '[' fail
+    echo "$0: setting scale_opts"
+    scale_opts="--self-loop-scale=1.0 --transition-scale=1.0"
+  fi
+  if [ "$acwt" = 0.1 ]; then  # string comparison: only the exact default "0.1" is replaced
+    echo "$0: setting acwt=1.0"
+    acwt=1.0
+  fi
+  if [ "$lmwt" = 10 ]; then  # string comparison: only the exact default "10" is replaced
+    echo "$0: setting lmwt=1"
+    lmwt=1
+  fi
+fi
+
 utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt
 cp $lang/phones.txt $dir
 
@@ -221,7 +237,7 @@ if [ $stage -le 3 ]; then
 
   # Make graphs w.r.t. to the original text (usually recording-level)
   steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
-    --scale-opts "--self-loop-scale=1.0 --transition-scale=1.0" \
+    --scale-opts "$scale_opts" \
     --nj $nj --cmd "$cmd" $text \
     $lang $dir $dir/graphs
   if [ -z "$utt2text" ]; then
@@ -268,7 +284,7 @@ if [ $stage -le 5 ]; then
   echo "$0: Decoding with biased language models..."
 
   steps/cleanup/decode_segmentation_nnet3.sh \
-    --acwt $acwt --post-decode-acwt $post_decode_acwt \
+    --acwt $acwt \
     --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \
     --skip-scoring true --allow-partial false \
     --extra-left-context $extra_left_context \