forked from kaldi-asr/kaldi
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch '23-rebase-upstream-02-13-20' into 'master'
Resolve "Rebase upstream 02/13/20". Closes kaldi-asr#23. See merge request dl/dgx/kaldi!69.
- Loading branch information
Showing
29 changed files
with
1,442 additions
and
144 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
tri1 | ||
%WER 40.91 [ 32272 / 78894, 2147 ins, 7478 del, 22647 sub ] exp/tri1/decode/wer_12_0.5 | ||
tri2b | ||
%WER 36.68 [ 28936 / 78894, 2752 ins, 5682 del, 20502 sub ] exp/tri2b/decode/wer_13_0.0 | ||
tri3b | ||
%WER 35.35 [ 27892 / 78894, 2587 ins, 7024 del, 18281 sub ] exp/tri3b/decode/wer_14_0.0 | ||
|
||
chain for dev set | ||
%WER 16.60 [ 13094 / 78894, 1314 ins, 2992 del, 8788 sub ] exp/chain/tdnn_1a_sp/decode_dev/wer_9_0.0 | ||
rnnlm-rescoring for dev set | ||
%WER 15.02 [ 11846 / 78894, 1248 ins, 2836 del, 7762 sub ] exp/chain/tdnn_1a_sp/decode_dev_rnnlm_1e_0.45/wer_9_0.0 | ||
|
||
chain for test_p2 set | ||
%WER 14.95 [ 10416 / 69668, 1129 ins, 2593 del, 6694 sub ] exp/chain/tdnn_1a_sp/decode_test_p2/wer_9_0.0 | ||
rnnlm-rescoring for test_p2 set | ||
%WER 13.51 [ 9413 / 69668, 1059 ins, 2517 del, 5837 sub ] exp/chain/tdnn_1a_sp/decode_test_p2_rnnlm_1e_0.45/wer_9_0.0 | ||
|
||
rnnlm-rescoring for mt_eval set | ||
%WER 12.02 [ 10829 / 90112, 1483 ins, 2401 del, 6945 sub ] exp/chain/tdnn_1a_sp/decode_mt_all_rnnlm_1e_0.45/wer_9_0.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash | ||
|
||
# Builds LM training text from the Arabic Gigaword corpus: every *.gz file under | ||
# $source_dir/data is decompressed and piped through local/arabic_convert.py, | ||
# the first $num lines are kept, and local/normalize_transcript_BW.pl writes | ||
# the final $giga_dir/text.$suffix. | ||
# | ||
# Usage: <this-script> <giga-dir> | ||
#   <giga-dir>  output directory (created if missing). An existing | ||
#               <giga-dir>/text is first moved to <giga-dir>/text.bkp. | ||
|
||
# Without an argument every output path would resolve under "/". | ||
if [ -z "${1:-}" ]; then | ||
  echo "Usage: $0 <giga-dir>" >&2 | ||
  exit 1 | ||
fi | ||
giga_dir=$1 | ||
|
||
source_dir=/export/corpora/LDC/LDC2011T11/arb_gw_5 | ||
num=2000000 | ||
suffix="2000k" | ||
|
||
if [ ! -d "$source_dir" ]; then | ||
  echo "source Arabic Gigaword does not exist." >&2 | ||
  exit 1 | ||
fi | ||
|
||
mkdir -p "$giga_dir" | ||
if [ -f "$giga_dir/text" ]; then | ||
  mv "$giga_dir/text" "$giga_dir/text.bkp" | ||
fi | ||
|
||
# Sort the file list so the $num-line prefix taken below does not depend on | ||
# directory order, and redirect the whole loop (truncating) so that a re-run | ||
# does not append a second copy of the corpus to text.arb. | ||
find "$source_dir/data/" -name "*.gz" | LC_ALL=C sort | while IFS= read -r file; do | ||
  gunzip -c "$file" | local/arabic_convert.py - | ||
done > "$giga_dir/text.arb" | ||
|
||
head -n "$num" "$giga_dir/text.arb" > "$giga_dir/text.arb.${suffix}" | ||
local/normalize_transcript_BW.pl "$giga_dir/text.arb.${suffix}" "$giga_dir/text.${suffix}" | ||
|
||
echo "finish preparing Arabic Gigaword" | ||
exit 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
#!/bin/bash | ||
|
||
. ./path.sh | ||
|
||
# Example script for lookahead composition. | ||
# | ||
# Builds a baseline graph with utils/mkgraph.sh and lookahead graphs with | ||
# utils/mkgraph_lookahead.sh, then decodes ${testset} five ways (base, static, | ||
# lookahead, arpa, arpa_fast). The WER, speed and graph-size figures recorded | ||
# below are the reference numbers for those five runs. | ||
# Requires KALDI_ROOT to be set (NOTE(review): presumably by path.sh - confirm). | ||
|
||
lm=tgmed | ||
am=exp/chain_cleaned/tdnn_1d_sp | ||
testset=test_clean | ||
|
||
# %WER 4.86 [ 2553 / 52576, 315 ins, 222 del, 2016 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead/wer_11_0.0 | ||
# %WER 4.79 [ 2518 / 52576, 279 ins, 292 del, 1947 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_arpa/wer_11_0.0 | ||
# %WER 4.82 [ 2532 / 52576, 286 ins, 290 del, 1956 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_arpa_fast/wer_11_0.0 | ||
# %WER 4.86 [ 2553 / 52576, 314 ins, 222 del, 2017 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_base/wer_11_0.0 | ||
# %WER 4.86 [ 2553 / 52576, 315 ins, 222 del, 2016 sub ] exp/chain_cleaned/tdnn_1d_sp/decode_test_clean_lookahead_static/wer_11_0.0 | ||
|
||
|
||
# Speed | ||
# | ||
# base 0.18 xRT | ||
# static 0.18 xRT | ||
# lookahead 0.29 xRT | ||
# arpa 0.35 xRT | ||
# arpa_fast 0.21 xRT | ||
|
||
# Graph size | ||
# | ||
# Base 476 Mb | ||
# Static 621 Mb | ||
# Lookahead 48 Mb HCL + 77 Mb Grammar | ||
# Lookahead + OpenGrm 48 Mb HCL + 42 Mb Grammar | ||
|
||
if [ ! -f "${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so" ]; then | ||
echo "Missing ${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so" | ||
echo "Make sure you compiled openfst with lookahead support. Run make in ${KALDI_ROOT}/tools after git pull." | ||
exit 1 | ||
fi | ||
if [ ! -f "${KALDI_ROOT}/tools/openfst/bin/ngramread" ]; then | ||
echo "You appear to not have OpenGRM tools installed. Missing ${KALDI_ROOT}/tools/openfst/bin/ngramread" | ||
echo "cd to $KALDI_ROOT/tools and run extras/install_opengrm.sh." | ||
exit 1 | ||
fi | ||
# Prepend the OpenFst extension directory instead of overwriting the variable, | ||
# so libraries the caller already had on LD_LIBRARY_PATH remain loadable. | ||
export LD_LIBRARY_PATH=${KALDI_ROOT}/tools/openfst/lib/fst${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} | ||
|
||
# Baseline | ||
utils/format_lm.sh data/lang data/local/lm/lm_${lm}.arpa.gz \ | ||
data/local/dict/lexicon.txt data/lang_test_${lm}_base | ||
|
||
utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov \ | ||
data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead_base | ||
|
||
steps/nnet3/decode.sh --nj 20 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead_base data/${testset}_hires ${am}/decode_${testset}_lookahead_base | ||
|
||
utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --remove-oov --compose-graph \ | ||
data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead | ||
|
||
# Decode with statically composed lookahead graph | ||
steps/nnet3/decode.sh --nj 20 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead_static | ||
|
||
# Decode with runtime composition | ||
steps/nnet3/decode_lookahead.sh --nj 20 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead | ||
|
||
# Compile arpa graph (the ARPA file follows ${lm}, like the baseline step above) | ||
utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --compose-graph \ | ||
data/lang_test_${lm}_base ${am} data/local/lm/lm_${lm}.arpa.gz ${am}/graph_${lm}_lookahead_arpa | ||
|
||
# Decode with runtime composition | ||
steps/nnet3/decode_lookahead.sh --nj 20 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa | ||
|
||
# Decode with runtime composition and tuned beams | ||
steps/nnet3/decode_lookahead.sh --nj 20 \ | ||
--beam 12.0 --max-active 3000 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_cleaned/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa_fast |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/bin/bash | ||
|
||
. ./path.sh | ||
|
||
# Example script for lookahead composition. | ||
# | ||
# Builds a baseline graph with utils/mkgraph.sh and lookahead graphs with | ||
# utils/mkgraph_lookahead.sh, then decodes ${testset} five ways (base, static, | ||
# lookahead, arpa, arpa_fast). The WER, speed and graph-size figures recorded | ||
# below are the reference numbers for those five runs. | ||
# Requires KALDI_ROOT to be set (NOTE(review): presumably by path.sh - confirm). | ||
|
||
lm=tgmed | ||
am=exp/chain_online_cmn/tdnn1k_sp | ||
testset=dev_clean_2 | ||
|
||
# %WER 10.32 [ 2078 / 20138, 201 ins, 275 del, 1602 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_base/wer_10_0.5 | ||
# %WER 10.29 [ 2073 / 20138, 200 ins, 272 del, 1601 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_static/wer_10_0.5 | ||
# %WER 10.25 [ 2064 / 20138, 192 ins, 277 del, 1595 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead/wer_10_0.5 | ||
# %WER 10.24 [ 2063 / 20138, 187 ins, 290 del, 1586 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_arpa/wer_10_0.5 | ||
# %WER 10.29 [ 2072 / 20138, 228 ins, 242 del, 1602 sub ] exp/chain_online_cmn/tdnn1k_sp/decode_dev_clean_2_lookahead_arpa_fast/wer_9_0.5 | ||
|
||
# Speed | ||
# | ||
# base 0.29 xRT | ||
# static 0.31 xRT | ||
# lookahead 0.77 xRT | ||
# arpa 1.03 xRT | ||
# arpa_fast 0.31 xRT | ||
|
||
# Graph size | ||
# | ||
# Base 461 Mb | ||
# Static 587 Mb | ||
# Lookahead 44 Mb HCL + 77 Mb Grammar | ||
# Lookahead + OpenGrm 44 Mb HCL + 42 Mb Grammar | ||
|
||
if [ ! -f "${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so" ]; then | ||
echo "Missing ${KALDI_ROOT}/tools/openfst/lib/libfstlookahead.so" | ||
echo "Make sure you compiled openfst with lookahead support. Run make in ${KALDI_ROOT}/tools after git pull." | ||
exit 1 | ||
fi | ||
if [ ! -f "${KALDI_ROOT}/tools/openfst/bin/ngramread" ]; then | ||
echo "You appear to not have OpenGRM tools installed. Missing ${KALDI_ROOT}/tools/openfst/bin/ngramread" | ||
echo "cd to $KALDI_ROOT/tools and run extras/install_opengrm.sh." | ||
exit 1 | ||
fi | ||
# Prepend the OpenFst extension directory instead of overwriting the variable, | ||
# so libraries the caller already had on LD_LIBRARY_PATH remain loadable. | ||
export LD_LIBRARY_PATH=${KALDI_ROOT}/tools/openfst/lib/fst${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} | ||
|
||
# Baseline | ||
utils/format_lm.sh data/lang data/local/lm/lm_${lm}.arpa.gz \ | ||
data/local/dict/lexicon.txt data/lang_test_${lm}_base | ||
|
||
utils/mkgraph.sh --self-loop-scale 1.0 --remove-oov \ | ||
data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead_base | ||
|
||
steps/nnet3/decode.sh --nj 20 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead_base data/${testset}_hires ${am}/decode_${testset}_lookahead_base | ||
|
||
utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --remove-oov --compose-graph \ | ||
data/lang_test_${lm}_base ${am} ${am}/graph_${lm}_lookahead | ||
|
||
# Decode with statically composed lookahead graph | ||
steps/nnet3/decode.sh --nj 20 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead_static | ||
|
||
# Decode with runtime composition | ||
steps/nnet3/decode_lookahead.sh --nj 20 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead data/${testset}_hires ${am}/decode_${testset}_lookahead | ||
|
||
# Compile arpa graph (the ARPA file follows ${lm}, like the baseline step above) | ||
utils/mkgraph_lookahead.sh --self-loop-scale 1.0 --compose-graph \ | ||
data/lang_test_${lm}_base ${am} data/local/lm/lm_${lm}.arpa.gz ${am}/graph_${lm}_lookahead_arpa | ||
|
||
# Decode with runtime composition | ||
steps/nnet3/decode_lookahead.sh --nj 20 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa | ||
|
||
# Decode with runtime composition and tuned beams | ||
steps/nnet3/decode_lookahead.sh --nj 20 \ | ||
--beam 12.0 --max-active 3000 \ | ||
--acwt 1.0 --post-decode-acwt 10.0 \ | ||
--online-ivector-dir exp/nnet3_online_cmn/ivectors_${testset}_hires \ | ||
${am}/graph_${lm}_lookahead_arpa data/${testset}_hires ${am}/decode_${testset}_lookahead_arpa_fast |
Oops, something went wrong.