Skip to content

Commit

Permalink
Merge branch '23-rebase-upstream-02-13-20' into 'master'
Browse files Browse the repository at this point in the history
Resolve "Rebase upstream 02/13/20"

Closes kaldi-asr#23

See merge request dl/dgx/kaldi!69
  • Loading branch information
hugovbraun committed Feb 14, 2020
2 parents a2271cc + de6d8f0 commit dc73e74
Show file tree
Hide file tree
Showing 29 changed files with 1,442 additions and 144 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ GSYMS
/tools/cub-1.8.0/
/tools/cub
/tools/python/
/tools/ngram-1.3.7.tar.gz
/tools/ngram-1.3.7/

# These CMakeLists.txt files are all generated on the fly at the moment.
# They are added here to avoid accidental check-in.
Expand Down
19 changes: 19 additions & 0 deletions egs/gale_arabic/s5d/RESULTS
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
tri1
%WER 40.91 [ 32272 / 78894, 2147 ins, 7478 del, 22647 sub ] exp/tri1/decode/wer_12_0.5
tri2b
%WER 36.68 [ 28936 / 78894, 2752 ins, 5682 del, 20502 sub ] exp/tri2b/decode/wer_13_0.0
tri3b
%WER 35.35 [ 27892 / 78894, 2587 ins, 7024 del, 18281 sub ] exp/tri3b/decode/wer_14_0.0

chain for dev set
%WER 16.60 [ 13094 / 78894, 1314 ins, 2992 del, 8788 sub ] exp/chain/tdnn_1a_sp/decode_dev/wer_9_0.0
rnnlm-rescoring for dev set
%WER 15.02 [ 11846 / 78894, 1248 ins, 2836 del, 7762 sub ] exp/chain/tdnn_1a_sp/decode_dev_rnnlm_1e_0.45/wer_9_0.0

chain for test_p2 set
%WER 14.95 [ 10416 / 69668, 1129 ins, 2593 del, 6694 sub ] exp/chain/tdnn_1a_sp/decode_test_p2/wer_9_0.0
rnnlm-rescoring for test_p2 set
%WER 13.51 [ 9413 / 69668, 1059 ins, 2517 del, 5837 sub ] exp/chain/tdnn_1a_sp/decode_test_p2_rnnlm_1e_0.45/wer_9_0.0

rnnlm-rescoring for mt_eval set
%WER 12.02 [ 10829 / 90112, 1483 ins, 2401 del, 6945 sub ] exp/chain/tdnn_1a_sp/decode_mt_all_rnnlm_1e_0.45/wer_9_0.0
70 changes: 35 additions & 35 deletions egs/gale_arabic/s5d/local/gale_train_lms.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,7 @@ lexicon=$2 # data/local/dict/lexicon.txt
dir=$3 # data/local/lm

shift 3
giga_dir=( $@ )
[ -z $giga_dir ] && echo "Training LM without using external Arabic Gigaword."
giga_dirs=( $@ )

for f in "$text" "$lexicon"; do
[ ! -f $x ] && echo "$0: No such file $f" && exit 1;
Expand Down Expand Up @@ -95,43 +94,44 @@ if [ $stage -le 1 ]; then
echo "training 4-gram lm"
ngram-count -text $dir/train.gz -order 4 -limit-vocab -vocab $dir/wordlist \
-unk -map-unk "<UNK>" -${smoothing}discount -interpolate -lm $dir/gale.o4g.${smoothing}.gz
echo "PPL for SWBD1 4gram LM:"
echo "PPL for GALE Arabic 4gram LM:"
ngram -unk -lm $dir/gale.o4g.${smoothing}.gz -ppl $dir/heldout
ngram -unk -lm $dir/gale.o4g.${smoothing}.gz -ppl $dir/heldout -debug 2 >& $dir/4gram.${smoothing}.ppl2
fi

if [ ! -z $giga_dirs ]; then
mkdir -p $dir/giga
if [ ! -f $giga_dirs/text.2000k ]; then
echo "Arabic Gigaword text not found, prepare it"
local/prepare_giga.sh $giga_dirs
fi

if [ $stage -le 2 ]; then
if [ ! -z $giga_dir ]; then
echo "Using external data."
mkdir -p $dir/giga
cp $giga_dir/text.2000k $dir/giga
cat $dir/giga/text.2000k | gzip -c > $dir/giga/text2000k.gz
cp $giga_dirs/text.2000k $dir/giga
cat $dir/giga/text.2000k | gzip -c > $dir/giga/text2000k.gz

for x in 3 4; do
smoothing="kn"
ngram-count -text $dir/giga/text2000k.gz -order $x -limit-vocab \
-vocab $dir/wordlist -unk -map-unk "<UNK>" -${smoothing}discount -interpolate \
-lm $dir/giga/giga.o${x}g.${smoothing}.gz
echo "PPL for Gigaword ${x}gram LM:"
ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout -debug 2 \
>& $dir/giga/${x}gram.${smoothing}.ppl2
compute-best-mix $dir/${x}gram.${smoothing}.ppl2 \
$dir/giga/${x}gram.${smoothing}.ppl2 >& $dir/gale_giga_mix.${x}gram.${smoothing}.log
grep 'best lambda' $dir/gale_giga_mix.${x}gram.${smoothing}.log | perl -e '
$_=<>;
s/.*\(//; s/\).*//;
@A = split;
die "Expecting 2 numbers; found: $_" if(@A!=2);
print "$A[0]\n$A[1]\n";' > $dir/gale_giga_mix.${x}gram.${smoothing}.weights
gale_weight=$(head -1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
giga_weight=$(tail -n 1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
ngram -order $x -lm $dir/gale.o${x}g.${smoothing}.gz -lambda $swb1_weight \
-mix-lm $dir/giga/giga.o${x}g.${smoothing}.gz \
-unk -write-lm $dir/gale_giga.o${x}g.${smoothing}.gz
echo "PPL for GALE + Gigaword ${x}gram LM:"
ngram -unk -lm $dir/gale_giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
done
fi
for x in 3 4; do
smoothing="kn"
ngram-count -text $dir/giga/text2000k.gz -order $x -limit-vocab \
-vocab $dir/wordlist -unk -map-unk "<UNK>" -${smoothing}discount -interpolate \
-lm $dir/giga/giga.o${x}g.${smoothing}.gz
echo "PPL for Gigaword ${x}gram LM:"
ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
ngram -unk -lm $dir/giga/giga.o${x}g.${smoothing}.gz -ppl $dir/heldout -debug 2 \
>& $dir/giga/${x}gram.${smoothing}.ppl2
compute-best-mix $dir/${x}gram.${smoothing}.ppl2 \
$dir/giga/${x}gram.${smoothing}.ppl2 >& $dir/gale_giga_mix.${x}gram.${smoothing}.log
grep 'best lambda' $dir/gale_giga_mix.${x}gram.${smoothing}.log | perl -e '
$_=<>;
s/.*\(//; s/\).*//;
@A = split;
die "Expecting 2 numbers; found: $_" if(@A!=2);
print "$A[0]\n$A[1]\n";' > $dir/gale_giga_mix.${x}gram.${smoothing}.weights
gale_weight=$(head -1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
giga_weight=$(tail -n 1 $dir/gale_giga_mix.${x}gram.${smoothing}.weights)
ngram -order $x -lm $dir/gale.o${x}g.${smoothing}.gz -lambda $swb1_weight \
-mix-lm $dir/giga/giga.o${x}g.${smoothing}.gz \
-unk -write-lm $dir/gale_giga.o${x}g.${smoothing}.gz
echo "PPL for GALE + Gigaword ${x}gram LM:"
ngram -unk -lm $dir/gale_giga.o${x}g.${smoothing}.gz -ppl $dir/heldout
done
fi
22 changes: 22 additions & 0 deletions egs/gale_arabic/s5d/local/prepare_giga.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

# Prepare a normalized text subset of the Arabic Gigaword corpus for
# language-model training.
#
# Usage: local/prepare_giga.sh <giga-dir>
#   <giga-dir>  output directory; the normalized text is written to
#               <giga-dir>/text.${suffix}.
#
# Reads the corpus from a fixed, site-specific path, converts every *.gz
# source file with local/arabic_convert.py, keeps the first $num lines and
# normalizes them with local/normalize_transcript_BW.pl.

set -e -o pipefail   # abort on the first failing step

giga_dir=$1
[ -n "$giga_dir" ] || { echo "$0: usage: $0 <giga-dir>" >&2; exit 1; }

source_dir=/export/corpora/LDC/LDC2011T11/arb_gw_5   # LDC Arabic Gigaword (site-specific path)
num=2000000        # number of lines kept for LM training
suffix="2000k"     # filename tag matching $num

[ -d "$source_dir" ] || { echo "source Arabic Gigaword does not exist."; exit 1; }

# Keep any previous final text around rather than silently overwriting it.
if [ -f "$giga_dir/text" ]; then
  mv "$giga_dir/text" "$giga_dir/text.bkp"
fi
mkdir -p "$giga_dir"

# Truncate first: the loop below appends, so a rerun would otherwise
# duplicate the whole corpus in text.arb.
: > "$giga_dir/text.arb"
find "$source_dir/data/" -name "*.gz" | while IFS= read -r file; do
  gunzip -c "$file" | local/arabic_convert.py - >> "$giga_dir/text.arb"
done

head -n "$num" "$giga_dir/text.arb" > "$giga_dir/text.arb.${suffix}"
local/normalize_transcript_BW.pl "$giga_dir/text.arb.${suffix}" "$giga_dir/text.${suffix}"

echo "finish preparing Arabic Gigaword"
exit 0
87 changes: 87 additions & 0 deletions egs/librispeech/s5/local/lookahead/run_lookahead.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/bash

# Example script for lookahead composition: builds lookahead decoding graphs
# for a chain model and decodes a test set with several graph variants
# (baseline, statically composed, runtime-composed, ARPA grammar, and ARPA
# grammar with tuned beams).

. ./path.sh

set -e   # stop at the first failed stage

lm=tgmed                          # LM name; expects data/local/lm/lm_${lm}.arpa.gz
am=exp/chain_cleaned/tdnn_1d_sp   # acoustic model directory
testset=test_clean
nj=20                             # number of parallel decoding jobs

# Reference results:
# %WER 4.86 [ 2553 / 52576, 315 ins, 222 del, 2016 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead/wer_11_0.0
# %WER 4.79 [ 2518 / 52576, 279 ins, 292 del, 1947 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_arpa/wer_11_0.0
# %WER 4.82 [ 2532 / 52576, 286 ins, 290 del, 1956 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_arpa_fast/wer_11_0.0
# %WER 4.86 [ 2553 / 52576, 314 ins, 222 del, 2017 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_base/wer_11_0.0
# %WER 4.86 [ 2553 / 52576, 315 ins, 222 del, 2016 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_static/wer_11_0.0

# Speed
#
# base       0.18 xRT
# static     0.18 xRT
# lookahead  0.29 xRT
# arpa       0.35 xRT
# arpa_fast  0.21 xRT

# Graph size
#
# Base                 476 Mb
# Static               621 Mb
# Lookahead            48 Mb HCL + 77 Mb Grammar
# Lookahead + OpenGrm  48 Mb HCL + 42 Mb Grammar

if [ ! -f "${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so" ]; then
    echo "Missing ${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so"
    echo "Make sure you compiled openfst with lookahead support. Run make in ${KALDI_ROOT}/tools after git pull."
    exit 1
fi
if [ ! -f "${KALDI_ROOT}/tools/openfst/bin/ngramread" ]; then
    echo "You appear to not have OpenGRM tools installed. Missing ${KALDI_ROOT}/tools/openfst/bin/ngramread"
    echo "cd to $KALDI_ROOT/tools and run extras/install_opengrm.sh."
    exit 1
fi
# Append rather than clobber any pre-existing LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=${KALDI_ROOT}/tools/openfst/lib/fst${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# Baseline graph and decode.
utils/format_lm.sh data/lang data/local/lm/lm_${lm}.arpa.gz \
    data/local/dict/lexicon.txt data/lang_test_${lm}_base

utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov \
    data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead_base

steps/nnet3/decode.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_base data/${testset}_hires ${am}/decode_${testset}_lookahead_base

utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --remove-oov --compose-graph \
    data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead

# Decode with statically composed lookahead graph
steps/nnet3/decode.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead_static

# Decode with runtime composition
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead

# Compile arpa graph (use ${lm} consistently; was hard-coded to lm_tgmed.arpa.gz)
utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --compose-graph \
    data/lang_test_${lm}_base ${am} data/local/lm/lm_${lm}.arpa.gz ${am}/graph_${lm}_lookahead_arpa

# Decode with runtime composition
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa

# Decode with runtime composition and tuned beams
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --beam 12.0 --max-active 3000 \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa_fast
86 changes: 86 additions & 0 deletions egs/mini_librispeech/s5/local/lookahead/run_lookahead.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash

# Example script for lookahead composition: builds lookahead decoding graphs
# for a chain model and decodes a test set with several graph variants
# (baseline, statically composed, runtime-composed, ARPA grammar, and ARPA
# grammar with tuned beams).

. ./path.sh

set -e   # stop at the first failed stage

lm=tgmed                           # LM name; expects data/local/lm/lm_${lm}.arpa.gz
am=exp/chain_online_cmn/tdnn1k_sp  # acoustic model directory
testset=dev_clean_2
nj=20                              # number of parallel decoding jobs

# Reference results:
# %WER 10.32 [ 2078 / 20138, 201 ins, 275 del, 1602 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_base/wer_10_0.5
# %WER 10.29 [ 2073 / 20138, 200 ins, 272 del, 1601 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_static/wer_10_0.5
# %WER 10.25 [ 2064 / 20138, 192 ins, 277 del, 1595 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead/wer_10_0.5
# %WER 10.24 [ 2063 / 20138, 187 ins, 290 del, 1586 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_arpa/wer_10_0.5
# %WER 10.29 [ 2072 / 20138, 228 ins, 242 del, 1602 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_arpa_fast/wer_9_0.5

# Speed
#
# base       0.29 xRT
# static     0.31 xRT
# lookahead  0.77 xRT
# arpa       1.03 xRT
# arpa_fast  0.31 xRT

# Graph size
#
# Base                 461 Mb
# Static               587 Mb
# Lookahead            44 Mb HCL + 77 Mb Grammar
# Lookahead + OpenGrm  44 Mb HCL + 42 Mb Grammar

if [ ! -f "${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so" ]; then
    echo "Missing ${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so"
    echo "Make sure you compiled openfst with lookahead support. Run make in ${KALDI_ROOT}/tools after git pull."
    exit 1
fi
if [ ! -f "${KALDI_ROOT}/tools/openfst/bin/ngramread" ]; then
    echo "You appear to not have OpenGRM tools installed. Missing ${KALDI_ROOT}/tools/openfst/bin/ngramread"
    echo "cd to $KALDI_ROOT/tools and run extras/install_opengrm.sh."
    exit 1
fi
# Append rather than clobber any pre-existing LD_LIBRARY_PATH.
export LD_LIBRARY_PATH=${KALDI_ROOT}/tools/openfst/lib/fst${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}

# Baseline graph and decode.
utils/format_lm.sh data/lang data/local/lm/lm_${lm}.arpa.gz \
    data/local/dict/lexicon.txt data/lang_test_${lm}_base

utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov \
    data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead_base

steps/nnet3/decode.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_base data/${testset}_hires ${am}/decode_${testset}_lookahead_base

utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --remove-oov --compose-graph \
    data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead

# Decode with statically composed lookahead graph
steps/nnet3/decode.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead_static

# Decode with runtime composition
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead

# Compile arpa graph (use ${lm} consistently; was hard-coded to lm_tgmed.arpa.gz)
utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --compose-graph \
    data/lang_test_${lm}_base ${am} data/local/lm/lm_${lm}.arpa.gz ${am}/graph_${lm}_lookahead_arpa

# Decode with runtime composition
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa

# Decode with runtime composition and tuned beams
steps/nnet3/decode_lookahead.sh --nj ${nj} \
    --beam 12.0 --max-active 3000 \
    --acwt 1.0 --post-decode-acwt 10.0 \
    --online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \
    ${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa_fast
Loading

0 comments on commit dc73e74

Please sign in to comment.