Skip to content

Commit

Permalink
Merge branch '23-rebase-upstream-02-13-20' into 'master'
Browse files Browse the repository at this point in the history
Resolve "Rebase upstream 02/13/20"

Closes kaldi-asr#23

See merge request dl/dgx/kaldi!69
  • Loading branch information
hugovbraun committed Feb 14, 2020
2 parents a2271cc + de6d8f0 commit dc73e74
Show file tree
Hide file tree
Showing 29 changed files with 1,442 additions and 144 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ GSYMS
/tools/cub-1.8.0/
/tools/cub
/tools/python/
/tools/ngram-1.3.7.tar.gz
/tools/ngram-1.3.7/

# These CMakeLists.txt files are all generated on the fly at the moment.
# They are added here to avoid accidental check-in.
Expand Down
19 changes: 19 additions & 0 deletions egs/gale_arabic/s5d/RESULTS
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
tri1
%WER 40.91 [ 32272 / 78894, 2147 ins, 7478 del, 22647 sub ] exp/tri1/decode/wer_12_0.5
tri2b
%WER 36.68 [ 28936 / 78894, 2752 ins, 5682 del, 20502 sub ] exp/tri2b/decode/wer_13_0.0
tri3b
%WER 35.35 [ 27892 / 78894, 2587 ins, 7024 del, 18281 sub ] exp/tri3b/decode/wer_14_0.0

chain for dev set
%WER 16.60 [ 13094 / 78894, 1314 ins, 2992 del, 8788 sub ] exp/chain/tdnn_1a_sp/decode_dev/wer_9_0.0
rnnlm-rescoring for dev set
%WER 15.02 [ 11846 / 78894, 1248 ins, 2836 del, 7762 sub ] exp/chain/tdnn_1a_sp/decode_dev_rnnlm_1e_0.45/wer_9_0.0

chain for test_p2 set
%WER 14.95 [ 10416 / 69668, 1129 ins, 2593 del, 6694 sub ] exp/chain/tdnn_1a_sp/decode_test_p2/wer_9_0.0
rnnlm-rescoring for test_p2 set
%WER 13.51 [ 9413 / 69668, 1059 ins, 2517 del, 5837 sub ] exp/chain/tdnn_1a_sp/decode_test_p2_rnnlm_1e_0.45/wer_9_0.0

rnnlm-rescoring for mt_eval set
%WER 12.02 [ 10829 / 90112, 1483 ins, 2401 del, 6945 sub ] exp/chain/tdnn_1a_sp/decode_mt_all_rnnlm_1e_0.45/wer_9_0.0
70 changes: 35 additions & 35 deletions egs/gale_arabic/s5d/local/gale_train_lms.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ lexicon=$2 # data/local/dict/lexicon.txt
dir=$3 # data/local/lm

shift 3
giga_dir=( $@ )
[ -z $giga_dir ] && echo "Training LM without using external Arabic Gigaword."
giga_dirs=( $@ )

for f in "$text" "$lexicon"; do
[ ! -f $x ] && echo "$0: No such file $f" && exit 1;
Expand Down Expand Up @@ -95,43 +94,44 @@ if [ $stage -le 1 ]; then
echo "training 4-gram lm"
ngram-count -text $dir/train.gz -order 4 -limit-vocab -vocab $dir/wordlist \
-unk -map-unk "<UNK>" -${smoothing}discount -interpolate -lm $dir/gale.o4g.${smoothing}.gz
echo "PPL for SWBD1 4gram LM:"
echo "PPL for GALE Arabic 4gram LM:"
ngram -unk -lm $dir/gale.o4g.${smoothing}.gz -ppl $dir/heldout
ngram -unk -lm $dir/gale.o4g.${smoothing}.gz -ppl $dir/heldout -debug 2 >& $dir/4gram.${smoothing}.ppl2
fi

if [ ! -z $giga_dirs ]; then
mkdir -p $dir/giga
if [ ! -f $giga_dirs/text.2000k ]; then
echo "Arabic Gigaword text not found, prepare it"
local/prepare_giga.sh $giga_dirs
fi

if [ $stage -le 2 ]; then
if [ ! -z $giga_dir ]; then
echo "Using external data."
mkdir -p $dir/giga
cp $giga_dir/text.2000k $dir/giga
cat $dir/giga/text.2000k | gzip -c > $dir/giga/text2000k.gz
cp $giga_dirs/text.2000k $dir/giga
cat $dir/giga/text.2000k | gzip -c > $dir/giga/text2000k.gz

for x in 3 4; do
smoothing="kn"
ngram-count -text $dir/giga/text2000k.gz -order $x -limit-vocab \
-vocab $dir/wordlist -unk -map-unk "<UNK>" -${smoothing}discount -interpolate \
-lm $dir/giga/giga.o${x}g.${smoothing}.gz
echo "PPL for Gigaword ${x}gram LM:"
ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout -debug 2 \
>& $dir/giga/${x}gram.${smoothing}.ppl2
compute-best-mix $dir/${x}gram.${smoothing}.ppl2 \
$dir/giga/${x}gram.${smoothing}.ppl2 >& $dir/gale_giga_mix.${x}gram.${smoothing}.log
grep 'best lambda' $dir/gale_giga_mix.${x}gram.${smoothing}.log | perl -e '
$_=<>;
s/.*\(//; s/\).*//;
@A = split;
die "Expecting 2 numbers; found: $_" if(@A!=2);
print "$A[0]\n$A[1]\n";' > $dir/gale_giga_mix.${x}gram.${smoothing}.weights
gale_weight=$(head -1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
giga_weight=$(tail -n 1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
ngram -order $x -lm $dir/gale.o${x}g.${smoothing}.gz -lambda $swb1_weight \
-mix-lm $dir/giga/giga.o${x}g.${smoothing}.gz \
-unk -write-lm $dir/gale_giga.o${x}g.${smoothing}.gz
echo "PPL for GALE + Gigaword ${x}gram LM:"
ngram -unk -lm $dir/gale_giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
done
fi
for x in 3 4; do
smoothing="kn"
ngram-count -text $dir/giga/text2000k.gz -order $x -limit-vocab \
-vocab $dir/wordlist -unk -map-unk "<UNK>" -${smoothing}discount -interpolate \
-lm $dir/giga/giga.o${x}g.${smoothing}.gz
echo "PPL for Gigaword ${x}gram LM:"
ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout -debug 2 \
>& $dir/giga/${x}gram.${smoothing}.ppl2
compute-best-mix $dir/${x}gram.${smoothing}.ppl2 \
$dir/giga/${x}gram.${smoothing}.ppl2 >& $dir/gale_giga_mix.${x}gram.${smoothing}.log
grep 'best lambda' $dir/gale_giga_mix.${x}gram.${smoothing}.log | perl -e '
$_=<>;
s/.*\(//; s/\).*//;
@A = split;
die "Expecting 2 numbers; found: $_" if(@A!=2);
print "$A[0]\n$A[1]\n";' > $dir/gale_giga_mix.${x}gram.${smoothing}.weights
gale_weight=$(head -1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
giga_weight=$(tail -n 1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
ngram -order $x -lm $dir/gale.o${x}g.${smoothing}.gz -lambda $swb1_weight \
-mix-lm $dir/giga/giga.o${x}g.${smoothing}.gz \
-unk -write-lm $dir/gale_giga.o${x}g.${smoothing}.gz
echo "PPL for GALE + Gigaword ${x}gram LM:"
ngram -unk -lm $dir/gale_giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
done
fi
22 changes: 22 additions & 0 deletions egs/gale_arabic/s5d/local/prepare_giga.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

# Prepare a normalized text subset of the Arabic Gigaword corpus for
# language-model training.
#
# Usage: local/prepare_giga.sh <giga-dir>
#   <giga-dir>  output directory; the normalized text is written to
#               <giga-dir>/text.${suffix}.
#
# Reads the corpus from a fixed, site-specific path, converts every *.gz
# source file with local/arabic_convert.py, keeps the first $num lines and
# normalizes them with local/normalize_transcript_BW.pl.

set -e -o pipefail   # abort on the first failing step

giga_dir=$1
[ -n "$giga_dir" ] || { echo "$0: usage: $0 <giga-dir>" >&2; exit 1; }

source_dir=/export/corpora/LDC/LDC2011T11/arb_gw_5   # LDC Arabic Gigaword (site-specific path)
num=2000000        # number of lines kept for LM training
suffix="2000k"     # filename tag matching $num

[ -d "$source_dir" ] || { echo "source Arabic Gigaword does not exist."; exit 1; }

# Keep any previous final text around rather than silently overwriting it.
if [ -f "$giga_dir/text" ]; then
  mv "$giga_dir/text" "$giga_dir/text.bkp"
fi
mkdir -p "$giga_dir"

# Truncate first: the loop below appends, so a rerun would otherwise
# duplicate the whole corpus in text.arb.
: > "$giga_dir/text.arb"
find "$source_dir/data/" -name "*.gz" | while IFS= read -r file; do
  gunzip -c "$file" | local/arabic_convert.py - >> "$giga_dir/text.arb"
done

head -n "$num" "$giga_dir/text.arb" > "$giga_dir/text.arb.${suffix}"
local/normalize_transcript_BW.pl "$giga_dir/text.arb.${suffix}" "$giga_dir/text.${suffix}"

echo "finish preparing Arabic Gigaword"
exit 0
87 changes: 87 additions & 0 deletions egs/librispeech/s5/local/lookahead/run_lookahead.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/bash

# Example script for lookahead composition: builds lookahead decoding graphs
# for a chain model and decodes a test set with several graph variants
# (baseline, statically composed, runtime-composed, ARPA grammar, and ARPA
# grammar with tuned beams).

. ./path.sh

set -e   # stop at the first failed stage

lm=tgmed                          # LM name; expects data/local/lm/lm_${lm}.arpa.gz
am=exp/chain_cleaned/tdnn_1d_sp   # acoustic model directory
testset=test_clean
nj=20                             # number of parallel decoding jobs

# Reference results:
# %WER 4.86 [ 2553 / 52576, 315 ins, 222 del, 2016 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead/wer_11_0.0
# %WER 4.79 [ 2518 / 52576, 279 ins, 292 del, 1947 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_arpa/wer_11_0.0
# %WER 4.82 [ 2532 / 52576, 286 ins, 290 del, 1956 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_arpa_fast/wer_11_0.0
# %WER 4.86 [ 2553 / 52576, 314 ins, 222 del, 2017 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_base/wer_11_0.0
# %WER 4.86 [ 2553 / 52576, 315 ins, 222 del, 2016 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_static/wer_11_0.0

# Speed
#
# base       0.18 xRT
# static     0.18 xRT
# lookahead  0.29 xRT
# arpa       0.35 xRT
# arpa_fast  0.21 xRT

# Graph size
#
# Base                 476 Mb
# Static               621 Mb
# Lookahead            48 Mb HCL + 77 Mb Grammar
# Lookahead + OpenGrm  48 Mb HCL + 42 Mb Grammar

if [ ! -f "${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so" ]; then
    echo "Missing ${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so"
    echo "Make sure you compiled openfst with lookahead support. Run make in ${KALDI_ROOT}/tools after git pull."
    exit 1
fi
if [ ! -f "${KALDI_ROOT}/tools/openfst/bin/ngramread" ]; then
    echo "You appear to not have OpenGRM tools installed. Missing ${KALDI_ROOT}/tools/openfst/bin/ngramread"
    echo "cd to $KALDI_ROOT/tools and run extras/install_opengrm.sh."
    exit 1
fi
# Append rather than clobber any pre-existing LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=${KALDI_ROOT}/tools/openfst/lib/fst${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# Baseline graph and decode.
utils/format_lm.sh data/lang data/local/lm/lm_${lm}.arpa.gz \
    data/local/dict/lexicon.txt data/lang_test_${lm}_base

utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov \
    data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead_base

steps/nnet3/decode.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_base data/${testset}_hires ${am}/decode_${testset}_lookahead_base

utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --remove-oov --compose-graph \
    data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead

# Decode with statically composed lookahead graph
steps/nnet3/decode.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead_static

# Decode with runtime composition
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead

# Compile arpa graph (use ${lm} consistently; was hard-coded to lm_tgmed.arpa.gz)
utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --compose-graph \
    data/lang_test_${lm}_base ${am} data/local/lm/lm_${lm}.arpa.gz ${am}/graph_${lm}_lookahead_arpa

# Decode with runtime composition
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa

# Decode with runtime composition and tuned beams
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --beam 12.0 --max-active 3000 \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa_fast
86 changes: 86 additions & 0 deletions egs/mini_librispeech/s5/local/lookahead/run_lookahead.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash

# Example script for lookahead composition: builds lookahead decoding graphs
# for a chain model and decodes a test set with several graph variants
# (baseline, statically composed, runtime-composed, ARPA grammar, and ARPA
# grammar with tuned beams).

. ./path.sh

set -e   # stop at the first failed stage

lm=tgmed                           # LM name; expects data/local/lm/lm_${lm}.arpa.gz
am=exp/chain_online_cmn/tdnn1k_sp  # acoustic model directory
testset=dev_clean_2
nj=20                              # number of parallel decoding jobs

# Reference results:
# %WER 10.32 [ 2078 / 20138, 201 ins, 275 del, 1602 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_base/wer_10_0.5
# %WER 10.29 [ 2073 / 20138, 200 ins, 272 del, 1601 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_static/wer_10_0.5
# %WER 10.25 [ 2064 / 20138, 192 ins, 277 del, 1595 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead/wer_10_0.5
# %WER 10.24 [ 2063 / 20138, 187 ins, 290 del, 1586 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_arpa/wer_10_0.5
# %WER 10.29 [ 2072 / 20138, 228 ins, 242 del, 1602 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_arpa_fast/wer_9_0.5

# Speed
#
# base       0.29 xRT
# static     0.31 xRT
# lookahead  0.77 xRT
# arpa       1.03 xRT
# arpa_fast  0.31 xRT

# Graph size
#
# Base                 461 Mb
# Static               587 Mb
# Lookahead            44 Mb HCL + 77 Mb Grammar
# Lookahead + OpenGrm  44 Mb HCL + 42 Mb Grammar

if [ ! -f "${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so" ]; then
    echo "Missing ${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so"
    echo "Make sure you compiled openfst with lookahead support. Run make in ${KALDI_ROOT}/tools after git pull."
    exit 1
fi
if [ ! -f "${KALDI_ROOT}/tools/openfst/bin/ngramread" ]; then
    echo "You appear to not have OpenGRM tools installed. Missing ${KALDI_ROOT}/tools/openfst/bin/ngramread"
    echo "cd to $KALDI_ROOT/tools and run extras/install_opengrm.sh."
    exit 1
fi
# Append rather than clobber any pre-existing LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=${KALDI_ROOT}/tools/openfst/lib/fst${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# Baseline graph and decode.
utils/format_lm.sh data/lang data/local/lm/lm_${lm}.arpa.gz \
    data/local/dict/lexicon.txt data/lang_test_${lm}_base

utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov \
    data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead_base

steps/nnet3/decode.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_base data/${testset}_hires ${am}/decode_${testset}_lookahead_base

utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --remove-oov --compose-graph \
    data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead

# Decode with statically composed lookahead graph
steps/nnet3/decode.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead_static

# Decode with runtime composition
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead

# Compile arpa graph (use ${lm} consistently; was hard-coded to lm_tgmed.arpa.gz)
utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --compose-graph \
    data/lang_test_${lm}_base ${am} data/local/lm/lm_${lm}.arpa.gz ${am}/graph_${lm}_lookahead_arpa

# Decode with runtime composition
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa

# Decode with runtime composition and tuned beams
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --beam 12.0 --max-active 3000 \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa_fast
Loading

0 comments on commit dc73e74

Please sign in to comment.