From 19355bff3509ccdd384dd41559e1a991d816eb71 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 24 Feb 2019 17:25:31 -0500 Subject: [PATCH 1/2] [scripts] Add missing option as default in nnet3 model-cleanup script --- .../cleanup/segment_long_utterances_nnet3.sh | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh index ae355c9f753..27577f2b17e 100755 --- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh @@ -4,7 +4,7 @@ # 2016 Vimal Manohar # Apache 2.0 -# This script is similar to steps/cleanup/segment_long_utterances.sh, but +# This script is similar to steps/cleanup/segment_long_utterances.sh, but # uses nnet3 acoustic model instead of GMM acoustic model for decoding. # This script performs segmentation of the input data based on the transcription # and outputs segmented data along with the corresponding aligned transcription. @@ -13,7 +13,7 @@ # are of manageable length for further processing, along with the portion of the # transcript that seems to match (aligns with) each segment. # This the light-supervised training scenario where the input transcription is -# not expected to be completely clean and may have significant errors. +# not expected to be completely clean and may have significant errors. 
# See "JHU Kaldi System for Arabic MGB-3 ASR Challenge using Diarization, # Audio-transcript Alignment and Transfer Learning": Vimal Manohar, Daniel # Povey, Sanjeev Khudanpur, ASRU 2017 @@ -49,14 +49,14 @@ post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 # Contexts must ideally match training extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) -extra_right_context=0 +extra_right_context=0 extra_left_context_initial=-1 extra_right_context_final=-1 frames_per_chunk=150 # i-vector options -extractor= # i-Vector extractor. If provided, will extract i-vectors. - # Required if the network was trained with i-vector extractor. +extractor= # i-Vector extractor. If provided, will extract i-vectors. + # Required if the network was trained with i-vector extractor. use_vad=false # Use energy-based VAD for i-vector extraction # TF-IDF similarity search options @@ -116,12 +116,12 @@ it and eliminate data where the transcript doesn't seem to match. --segmentation-extra-opts 'opts' # Additional options to segment_ctm_edits_mild.py. # Please run steps/cleanup/internal/segment_ctm_edits_mild.py # without arguments to see allowed options. - --align-full-hyp # If true, align full hypothesis - i.e. trackback from the end to get the alignment. - This is different from the normal + --align-full-hyp # If true, align full hypothesis + i.e. trackback from the end to get the alignment. + This is different from the normal Smith-Waterman alignment, where the traceback will be from the maximum score. - --extractor # i-vector extractor directory if i-vector is + --extractor # i-vector extractor directory if i-vector is # to be used during decoding. Must match # the extractor used for training neural-network. --use-vad # If true, uses energy-based VAD to apply frame weights @@ -221,6 +221,7 @@ if [ $stage -le 3 ]; then # Make graphs w.r.t. 
to the original text (usually recording-level) steps/cleanup/make_biased_lm_graphs.sh $graph_opts \ + --scale-opts "--self-loop-scale=1.0 --transition-scale=1.0" \ --nj $nj --cmd "$cmd" $text \ $lang $dir $dir/graphs if [ -z "$utt2text" ]; then From 974fa81f74709d87c32e19248355ef8c39fadc44 Mon Sep 17 00:00:00 2001 From: Daniel Povey Date: Sun, 24 Feb 2019 17:41:52 -0500 Subject: [PATCH 2/2] [scripts] Make sure nnet3 segmentation opts set right for chain systems --- .../cleanup/segment_long_utterances_nnet3.sh | 26 +++++++++++++++---- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh index 27577f2b17e..751200bdf83 100755 --- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh +++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh @@ -4,6 +4,7 @@ # 2016 Vimal Manohar # Apache 2.0 + # This script is similar to steps/cleanup/segment_long_utterances.sh, but # uses nnet3 acoustic model instead of GMM acoustic model for decoding. # This script performs segmentation of the input data based on the transcription @@ -39,13 +40,11 @@ seconds_per_spk_max=30 # Decode options graph_opts= +scale_opts= # for making the graphs beam=15.0 lattice_beam=1.0 lmwt=10 - acwt=0.1 # Just a default value, used for adaptation and beam-pruning.. -post_decode_acwt=1.0 # can be used in 'chain' systems to scale acoustics by 10 so the - # regular scoring script works. # Contexts must ideally match training extra_left_context=0 # Set to some large value, typically 40 for LSTM (must match training) @@ -168,6 +167,23 @@ cp $srcdir/cmvn_opts $dir cp $srcdir/{splice_opts,delta_opts,final.mat,final.alimdl} $dir 2>/dev/null || true cp $srcdir/frame_subsampling_factor $dir 2>/dev/null || true +if [ -f $srcdir/frame_subsampling_factor ]; then + echo "$0: guessing that this is a chain system, checking parameters." 
+ if [ -z "$scale_opts" ]; then + echo "$0: setting scale_opts" + scale_opts="--self-loop-scale=1.0 --transition-scale=1.0" + fi + if [ "$acwt" == 0.1 ]; then + echo "$0: setting acwt=1.0" + acwt=1.0 + fi + if [ "$lmwt" == 10 ]; then + echo "$0: setting lmwt=1.0" + lmwt=1 + fi +fi + + utils/lang/check_phones_compatible.sh $lang/phones.txt $srcdir/phones.txt cp $lang/phones.txt $dir @@ -221,7 +237,7 @@ if [ $stage -le 3 ]; then # Make graphs w.r.t. to the original text (usually recording-level) steps/cleanup/make_biased_lm_graphs.sh $graph_opts \ - --scale-opts "--self-loop-scale=1.0 --transition-scale=1.0" \ + ${scale_opts:+--scale-opts "$scale_opts"} \ --nj $nj --cmd "$cmd" $text \ $lang $dir $dir/graphs if [ -z "$utt2text" ]; then @@ -268,7 +284,7 @@ if [ $stage -le 5 ]; then echo "$0: Decoding with biased language models..." steps/cleanup/decode_segmentation_nnet3.sh \ - --acwt $acwt --post-decode-acwt $post_decode_acwt \ + --acwt $acwt \ --beam $beam --lattice-beam $lattice_beam --nj $nj --cmd "$cmd --mem 4G" \ --skip-scoring true --allow-partial false \ --extra-left-context $extra_left_context \