diff --git a/.gitignore b/.gitignore index cdcd13ec8b5..7317468dba0 100644 --- a/.gitignore +++ b/.gitignore @@ -73,10 +73,10 @@ GSYMS /src/kaldi.mk.bak # /egs/ -/egs/*/s*/mfcc -/egs/*/s*/plp -/egs/*/s*/exp -/egs/*/s*/data +/egs/*/*/mfcc +/egs/*/*/plp +/egs/*/*/exp +/egs/*/*/data # /tools/ /tools/pocolm/ diff --git a/egs/bentham/README.txt b/egs/bentham/README.txt new file mode 100644 index 00000000000..02870c265f6 --- /dev/null +++ b/egs/bentham/README.txt @@ -0,0 +1,5 @@ +This directory contains example scripts for handwriting recognition on +the Bentham dataset: +http://www.transcriptorium.eu/~htrcontest/contestICFHR2014/public_html/ +In the ICFHR 2014 contest, the best performing system in the unrestricted +track obtained a WER of 8.6%. diff --git a/egs/bentham/v1/cmd.sh b/egs/bentham/v1/cmd.sh new file mode 100755 index 00000000000..3c8eb9f93a5 --- /dev/null +++ b/egs/bentham/v1/cmd.sh @@ -0,0 +1,13 @@ +# you can change cmd.sh depending on what type of queue you are using. +# If you have no queueing system and want to run on a local machine, you +# can change all instances 'queue.pl' to run.pl (but be careful and run +# commands one by one: most recipes will exhaust the memory on your +# machine). queue.pl works with GridEngine (qsub). slurm.pl works +# with slurm. Different queues are configured differently, with different +# queue names and different ways of specifying things like memory; +# to account for these differences you can create and edit the file +# conf/queue.conf to match your queue's configuration. Search for +# conf/queue.conf in http://kaldi-asr.org/doc/queue.html for more information, +# or search for the string 'default_config' in utils/queue.pl or utils/slurm.pl. + +export cmd="queue.pl" diff --git a/egs/bentham/v1/image b/egs/bentham/v1/image new file mode 120000 index 00000000000..6a4b3afeb09 --- /dev/null +++ b/egs/bentham/v1/image @@ -0,0 +1 @@ +../../cifar/v1/image \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/compare_wer.sh b/egs/bentham/v1/local/chain/compare_wer.sh new file mode 100755 index 00000000000..2ce14e13694 --- /dev/null +++ b/egs/bentham/v1/local/chain/compare_wer.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# this script is used for comparing decoding results between systems. +# e.g. local/chain/compare_wer.sh exp/chain/cnn{1a,1b} + +# Copyright 2017 Chun Chieh Chang +# 2017 Ashish Arora + +if [ $# == 0 ]; then + echo "Usage: $0: [ ... ]" + echo "e.g.: $0 exp/chain/cnn{1a,1b}" + exit 1 +fi +. 
./path.sh + +echo "# $0 $*" +used_epochs=false + +echo -n "# System " +for x in $*; do printf "% 10s" " $(basename $x)"; done +echo + +echo -n "# WER " +for x in $*; do + wer=$(cat $x/decode_test/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) " +for x in $*; do + wer="--" + [ -d $x/decode_test_rescored ] && wer=$(cat $x/decode_test_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER " +for x in $*; do + cer=$(cat $x/decode_test/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) " +for x in $*; do + cer="--" + [ -d $x/decode_test_rescored ] && cer=$(cat $x/decode_test_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# WER val " +for x in $*; do + wer=$(cat $x/decode_val/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# WER (rescored) val " +for x in $*; do + wer="--" + [ -d $x/decode_val_rescored ] && wer=$(cat $x/decode_val_rescored/scoring_kaldi/best_wer | awk '{print $2}') + printf "% 10s" $wer +done +echo + +echo -n "# CER val " +for x in $*; do + cer=$(cat $x/decode_val/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +echo -n "# CER (rescored) val " +for x in $*; do + cer="--" + [ -d $x/decode_val_rescored ] && cer=$(cat $x/decode_val_rescored/scoring_kaldi/best_cer | awk '{print $2}') + printf "% 10s" $cer +done +echo + +if $used_epochs; then + exit 0; # the diagnostics aren't comparable between regular and discriminatively trained systems. +fi + +echo -n "# Final train prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -v xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final train prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_train.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Final valid prob (xent) " +for x in $*; do + prob=$(grep Overall $x/log/compute_prob_valid.final.log | grep -w xent | awk '{printf("%.4f", $8)}') + printf "% 10s" $prob +done +echo + +echo -n "# Parameters " +for x in $*; do + params=$(nnet3-info $x/final.mdl 2>/dev/null | grep num-parameters | cut -d' ' -f2 | awk '{printf "%0.2fM\n",$1/1000000}') + printf "% 10s" $params +done +echo diff --git a/egs/bentham/v1/local/chain/run_cnn_e2eali.sh b/egs/bentham/v1/local/chain/run_cnn_e2eali.sh new file mode 120000 index 00000000000..e2545b0186e --- /dev/null +++ b/egs/bentham/v1/local/chain/run_cnn_e2eali.sh @@ -0,0 +1 @@ +tuning/run_cnn_e2eali_1a.sh \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/run_e2e_cnn.sh b/egs/bentham/v1/local/chain/run_e2e_cnn.sh new file mode 120000 index 00000000000..d26ba0182ce --- /dev/null +++ b/egs/bentham/v1/local/chain/run_e2e_cnn.sh @@ -0,0 +1 @@ +tuning/run_e2e_cnn_1a.sh \ No newline at end of file diff --git a/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh new file mode 100755 index 00000000000..6bac5a22398 --- /dev/null +++ b/egs/bentham/v1/local/chain/tuning/run_cnn_e2eali_1a.sh @@ -0,0 +1,261 @@ +#!/bin/bash + +# local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ 
exp/chain/cnn_e2eali_1a +# System e2e_cnn_1a cnn_e2eali_1a +# WER 13.72 8.14 +# WER (rescored) 13.40 8.00 +# CER 6.56 2.82 +# CER (rescored) 6.33 2.73 +# WER val 13.51 8.19 +# WER (rescored) val 13.38 7.97 +# CER val 6.40 2.93 +# CER (rescored) val 6.29 2.90 +# Final train prob 0.1037 -0.0613 +# Final valid prob 0.0720 -0.0988 +# Final train prob (xent) -0.3706 +# Final valid prob (xent) -0.4669 +# Parameters 11.54M 4.29M + +# steps/info/chain_dir_info.pl exp/chain/cnn_e2eali_1a +# exp/chain/cnn_e2eali_1a: num-iters=20 nj=3..5 num-params=4.3M dim=40->336 combine=-0.066->-0.066 (over 1) xent:train/valid[12,19,final]=(-0.822,-0.437,-0.371/-0.859,-0.514,-0.467) logprob:train/valid[12,19,final]=(-0.188,-0.078,-0.061/-0.204,-0.114,-0.099) + +set -e -o pipefail + +stage=0 + +nj=30 +train_set=train +decode_val=true +nnet3_affix= # affix for exp dirs, e.g. it was _cleaned in tedlium. +affix=_1a #affix for TDNN+LSTM directory e.g. "1a" or "1b", in case we change the configuration. +e2echain_model_dir=exp/chain/e2e_cnn_1a +common_egs_dir= +reporting_email= + +# chain options +train_stage=-10 +xent_regularize=0.1 +frame_subsampling_factor=4 +# training chunk-options +chunk_width=340,300,200,100 +num_leaves=500 +# we don't need extra left/right context for TDNN systems. +chunk_left_context=0 +chunk_right_context=0 +tdnn_dim=550 +# training options +srand=0 +remove_egs=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +dropout_schedule='0,0@0.20,0.2@0.50,0' +# End configuration section. +echo "$0 $@" # Print the command line for logging + + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + + +if ! cuda-compiled; then + cat <$lang/topo + fi +fi + +if [ $stage -le 2 ]; then + # Get the alignments as lattices (gives the chain training more freedom). + # use the same num-jobs as the alignments + steps/nnet3/align_lats.sh --nj $nj --cmd "$cmd" \ + --acoustic-scale 1.0 \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0' \ + ${train_data_dir} data/lang $e2echain_model_dir $lat_dir + echo "" >$lat_dir/splice_opts +fi + +if [ $stage -le 3 ]; then + # Build a tree using our new topology. We know we have alignments for the + # speed-perturbed data (local/nnet3/run_ivector_common.sh made them), so use + # those. The num-leaves is always somewhat less than the num-leaves from + # the GMM baseline. + if [ -f $tree_dir/final.mdl ]; then + echo "$0: $tree_dir/final.mdl already exists, refusing to overwrite it." 
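+    # (If the intention really is to rebuild the tree, one option is to remove
+    # the old directory first, e.g. `rm -r $tree_dir`, and re-run this stage.)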
+ exit 1; + fi + + steps/nnet3/chain/build_tree.sh \ + --frame-subsampling-factor $frame_subsampling_factor \ + --alignment-subsampling-factor 1 \ + --context-opts "--context-width=2 --central-position=1" \ + --cmd "$cmd" $num_leaves ${train_data_dir} \ + $lang $ali_dir $tree_dir +fi + + +if [ $stage -le 4 ]; then + mkdir -p $dir + echo "$0: creating neural net configs using the xconfig parser"; + + num_targets=$(tree-info $tree_dir/tree | grep num-pdfs | awk '{print $2}') + learning_rate_factor=$(echo "print 0.5/$xent_regularize" | python) + cnn_opts="l2-regularize=0.03 dropout-proportion=0.0" + tdnn_opts="l2-regularize=0.03" + output_opts="l2-regularize=0.04" + common1="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="$cnn_opts required-time-offsets= height-offsets=-2,-1,0,1,2 num-filters-out=70" + common3="$cnn_opts required-time-offsets= height-offsets=-1,0,1 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-dropout-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-dropout-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn4 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-dropout-layer name=cnn5 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common3 height-subsample-out=2 + conv-relu-batchnorm-dropout-layer name=cnn6 height-in=10 height-out=10 time-offsets=-4,0,4 $common3 + relu-batchnorm-dropout-layer name=tdnn1 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + relu-batchnorm-dropout-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim $tdnn_opts dropout-proportion=0.0 + + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts + + # adding the layers for xent branch + # This block prints the configs for a separate output that will be + # trained with a cross-entropy objective in the 'chain' mod?els... this + # has the effect of regularizing the hidden parts of the model. we use + # 0.5 / args.xent_regularize as the learning rate factor- the factor of + # 0.5 / args.xent_regularize is suitable as it means the xent + # final-layer learns at a rate independent of the regularization + # constant; and the 0.5 was tuned so as to make the relative progress + # similar in the xent and regular final layers. + relu-batchnorm-layer name=prefinal-xent input=tdnn3 dim=$tdnn_dim target-rms=0.5 $tdnn_opts + output-layer name=output-xent dim=$num_targets learning-rate-factor=$learning_rate_factor max-change=1.5 $output_opts +EOF + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs/ +fi + + +if [ $stage -le 5 ]; then + if [[ $(hostname -f) == *.clsp.jhu.edu ]] && [ ! 
-d $dir/egs/storage ]; then + utils/create_split_dir.pl \ + /export/b0{3,4,5,6}/$USER/kaldi-data/egs/iam-$(date +'%m_%d_%H_%M')/s5/$dir/egs/storage $dir/egs/storage + fi + + steps/nnet3/chain/train.py --stage=$train_stage \ + --cmd="$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.xent-regularize $xent_regularize \ + --chain.leaky-hmm-coefficient=0.1 \ + --chain.l2-regularize=0.00005 \ + --chain.apply-deriv-weights=true \ + --chain.lm-opts="--ngram-order=2 --no-prune-ngram-order=1 --num-extra-lm-states=1000" \ + --chain.frame-subsampling-factor=$frame_subsampling_factor \ + --chain.alignment-subsampling-factor=1 \ + --chain.left-tolerance 3 \ + --chain.right-tolerance 3 \ + --trainer.srand=$srand \ + --trainer.max-param-change=2.0 \ + --trainer.num-epochs=5 \ + --trainer.frames-per-iter=1500000 \ + --trainer.optimization.num-jobs-initial=3 \ + --trainer.optimization.num-jobs-final=5 \ + --trainer.dropout-schedule $dropout_schedule \ + --trainer.optimization.initial-effective-lrate=0.001 \ + --trainer.optimization.final-effective-lrate=0.0001 \ + --trainer.optimization.shrink-value=1.0 \ + --trainer.num-chunk-per-minibatch=32,16 \ + --trainer.optimization.momentum=0.0 \ + --egs.chunk-width=$chunk_width \ + --egs.chunk-left-context=$chunk_left_context \ + --egs.chunk-right-context=$chunk_right_context \ + --egs.chunk-left-context-initial=0 \ + --egs.chunk-right-context-final=0 \ + --egs.dir="$common_egs_dir" \ + --egs.opts="--frames-overlap-per-eg 0 --constrained false" \ + --cleanup.remove-egs=$remove_egs \ + --use-gpu=true \ + --reporting.email="$reporting_email" \ + --feat-dir=$train_data_dir \ + --tree-dir=$tree_dir \ + --lat-dir=$lat_dir \ + --dir=$dir || exit 1; +fi + +if [ $stage -le 6 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 7 ]; then + frames_per_chunk=$(echo $chunk_width | cut -d, -f1) + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --extra-left-context $chunk_left_context \ + --extra-right-context $chunk_right_context \ + --extra-left-context-initial 0 \ + --extra-right-context-final 0 \ + --frames-per-chunk $frames_per_chunk \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh new file mode 100755 index 00000000000..716bdce3729 --- /dev/null +++ b/egs/bentham/v1/local/chain/tuning/run_e2e_cnn_1a.sh @@ -0,0 +1,166 @@ +#!/bin/bash +# Copyright 2017 Hossein Hadian + +# This script does end2end chain training (i.e. 
from scratch) +# ./local/chain/compare_wer.sh exp/chain/e2e_cnn_1a/ +# System e2e_cnn_1b +# WER 13.72 +# WER (rescored) 13.40 +# CER 6.56 +# CER (rescored) 6.33 +# WER val 13.51 +# WER (rescored) val 13.38 +# CER val 6.40 +# CER (rescored) val 6.29 +# Final train prob 0.1037 +# Final valid prob 0.0720 +# Final train prob (xent) +# Final valid prob (xent) +# Parameters 11.54M +# steps/info/chain_dir_info.pl exp/chain/e2e_cnn_1a +# exp/chain/e2e_cnn_1a: num-iters=26 nj=2..4 num-params=11.5M dim=40->17112 combine=0.054->0.054 (over 1) logprob:train/valid[16,25,final]=(0.078,0.102,0.104/0.051,0.069,0.072) +set -e + +# configs for 'chain' +stage=0 +train_stage=-10 +get_egs_stage=-10 +affix=1a +nj=30 + +# training options +tdnn_dim=450 +minibatch_size=150=100,64/300=50,32/600=25,16/1200=16,8 +common_egs_dir= +train_set=train +decode_val=true +lang_decode=data/lang +lang_rescore=data/lang_rescore_6g +if $decode_val; then maybe_val=val; else maybe_val= ; fi +# End configuration section. +echo "$0 $@" # Print the command line for logging + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh + +if ! cuda-compiled; then + cat <$lang/topo +fi + +if [ $stage -le 1 ]; then + steps/nnet3/chain/e2e/prepare_e2e.sh --nj 30 --cmd "$cmd" \ + --shared-phones true \ + --type biphone \ + data/$train_set $lang $treedir + $cmd $treedir/log/make_phone_lm.log \ + cat data/$train_set/text \| \ + steps/nnet3/chain/e2e/text_to_phones.py data/lang \| \ + utils/sym2int.pl -f 2- data/lang/phones.txt \| \ + chain-est-phone-lm --num-extra-lm-states=500 \ + ark:- $treedir/phone_lm.fst +fi + +if [ $stage -le 2 ]; then + echo "$0: creating neural net configs using the xconfig parser"; + num_targets=$(tree-info $treedir/tree | grep num-pdfs | awk '{print $2}') + common1="height-offsets=-2,-1,0,1,2 num-filters-out=36" + common2="height-offsets=-2,-1,0,1,2 num-filters-out=70" + mkdir -p $dir/configs + cat < $dir/configs/network.xconfig + input dim=40 name=input + + conv-relu-batchnorm-layer name=cnn1 height-in=40 height-out=40 time-offsets=-3,-2,-1,0,1,2,3 $common1 + conv-relu-batchnorm-layer name=cnn2 height-in=40 height-out=20 time-offsets=-2,-1,0,1,2 $common1 height-subsample-out=2 + conv-relu-batchnorm-layer name=cnn3 height-in=20 height-out=20 time-offsets=-4,-2,0,2,4 $common2 + conv-relu-batchnorm-layer name=cnn4 height-in=20 height-out=10 time-offsets=-4,-2,0,2,4 $common2 height-subsample-out=2 + relu-batchnorm-layer name=tdnn1 input=Append(-4,-2,0,2,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn2 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn3 input=Append(-4,0,4) dim=$tdnn_dim + relu-batchnorm-layer name=tdnn4 input=Append(-4,0,4) dim=$tdnn_dim + ## adding the layers for chain branch + relu-batchnorm-layer name=prefinal-chain dim=$tdnn_dim target-rms=0.5 $output_opts + output-layer name=output include-log-softmax=false dim=$num_targets max-change=1.5 $output_opts +EOF + + steps/nnet3/xconfig_to_configs.py --xconfig-file $dir/configs/network.xconfig --config-dir $dir/configs +fi + +if [ $stage -le 3 ]; then + # no need to store the egs in a shared storage because we always + # remove them. Anyway, it takes only 5 minutes to generate them. 
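+  # If you nevertheless want to spread the egs over several disks, the same
+  # utils/create_split_dir.pl pattern used in run_cnn_e2eali_1a.sh could be
+  # applied here; the paths below are CLSP-grid specific and only illustrative:
+  #   utils/create_split_dir.pl \
+  #     /export/b0{3,4,5,6}/$USER/kaldi-data/egs/bentham-$(date +'%m_%d_%H_%M')/v1/$dir/egs/storage \
+  #     $dir/egs/storage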
+ + steps/nnet3/chain/e2e/train_e2e.py --stage $train_stage \ + --cmd "$cmd" \ + --feat.cmvn-opts="--norm-means=false --norm-vars=false" \ + --chain.leaky-hmm-coefficient 0.1 \ + --chain.l2-regularize 0.00005 \ + --chain.apply-deriv-weights false \ + --egs.dir "$common_egs_dir" \ + --egs.stage $get_egs_stage \ + --egs.opts "--num_egs_diagnostic 100 --num_utts_subset 400" \ + --chain.frame-subsampling-factor 4 \ + --chain.alignment-subsampling-factor 4 \ + --trainer.num-chunk-per-minibatch $minibatch_size \ + --trainer.frames-per-iter 1000000 \ + --trainer.num-epochs 4 \ + --trainer.optimization.momentum 0 \ + --trainer.optimization.num-jobs-initial 2 \ + --trainer.optimization.num-jobs-final 4 \ + --trainer.optimization.initial-effective-lrate 0.001 \ + --trainer.optimization.final-effective-lrate 0.0001 \ + --trainer.optimization.shrink-value 1.0 \ + --trainer.max-param-change 2.0 \ + --cleanup.remove-egs true \ + --feat-dir data/${train_set} \ + --tree-dir $treedir \ + --dir $dir || exit 1; +fi + +if [ $stage -le 4 ]; then + # The reason we are using data/lang here, instead of $lang, is just to + # emphasize that it's not actually important to give mkgraph.sh the + # lang directory with the matched topology (since it gets the + # topology file from the model). So you could give it a different + # lang directory, one that contained a wordlist and LM of your choice, + # as long as phones.txt was compatible. + + utils/mkgraph.sh \ + --self-loop-scale 1.0 $lang_decode \ + $dir $dir/graph || exit 1; +fi + +if [ $stage -le 5 ]; then + for decode_set in test $maybe_val; do + steps/nnet3/decode.sh --acwt 1.0 --post-decode-acwt 10.0 \ + --nj $nj --cmd "$cmd" \ + $dir/graph data/$decode_set $dir/decode_$decode_set || exit 1; + + steps/lmrescore_const_arpa.sh --cmd "$cmd" $lang_decode $lang_rescore \ + data/$decode_set $dir/decode_${decode_set}{,_rescored} || exit 1 + done +fi + +echo "Done. Date: $(date). Results:" +local/chain/compare_wer.sh $dir diff --git a/egs/bentham/v1/local/check_tools.sh b/egs/bentham/v1/local/check_tools.sh new file mode 100755 index 00000000000..5b4d3107d3b --- /dev/null +++ b/egs/bentham/v1/local/check_tools.sh @@ -0,0 +1,43 @@ +#!/bin/bash -u + +# Copyright 2015 (c) Johns Hopkins University (Jan Trmal ) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +# MERCHANTABLITY OR NON-INFRINGEMENT. +# See the Apache 2 License for the specific language governing permissions and +# limitations under the License. + +[ -f ./path.sh ] && . ./path.sh +set +e + +command -v python3 >&/dev/null \ + || { echo >&2 "python3 not found on PATH. You will have to install Python3, preferably >= 3.6"; exit 1; } + +python3 -c "import numpy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs numpy installed." + exit 1 +fi + +python3 -c "import scipy" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy installed." + exit 1 +fi + +python3 -c "import scipy.misc; scipy.misc.__dict__['imread']" +if [ $? -ne 0 ] ; then + echo >&2 "This recipe needs scipy-image and Pillow installed." 
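+  # One possible way to install these (exact commands depend on your Python
+  # setup; scipy.misc.imread is absent from recent scipy releases, so an older
+  # scipy such as 1.0.x may be needed), e.g.:
+  #   pip3 install numpy pillow 'scipy==1.0.0'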
+ exit 1 +fi + + +exit 0 diff --git a/egs/bentham/v1/local/create_splits.sh b/egs/bentham/v1/local/create_splits.sh new file mode 100755 index 00000000000..93e8bf1b12e --- /dev/null +++ b/egs/bentham/v1/local/create_splits.sh @@ -0,0 +1,39 @@ +#!/bin/bash +# Copyright 2018 Desh Raj (Johns Hopkins University) + +# This script reads the extracted Bentham database files and creates +# the following files (for all the data subsets): +# text, utt2spk, images.scp. + +download_dir=$1 +save_dir=$2 +mkdir -p $save_dir/{train,val,test} +touch $save_dir/{train,val,test}/{text,images.scp,utt2spk,spk2utt} + +partition_dir=$download_dir"/gt/Partitions/" +lines_dir=$download_dir"/gt/Images/Lines/" +text_dir=$download_dir"/gt/Transcriptions/" + +function split { + echo "Creating $1 split" + split_dir=$save_dir/$1 + line_file=$partition_dir/$2 + + while read -r line; do + name="$line" + spkid=${name:0:11} + echo -n $name" " | cat - $text_dir/$name* >> $split_dir/text + echo >> $split_dir/text + echo $name $lines_dir"/"$name".png" >> $split_dir/images.scp + echo $name $spkid >> $split_dir/utt2spk + done < "$line_file" + + sed -i '/^\s*$/d' $split_dir/images.scp + sed -i '/^\s*$/d' $split_dir/text + sed -i '/^\s*$/d' $split_dir/utt2spk + utils/utt2spk_to_spk2utt.pl $split_dir/utt2spk > $split_dir/spk2utt +} + +split train TrainLines.lst +split val ValidationLines.lst +split test TestLines.lst diff --git a/egs/bentham/v1/local/download_bentham_text.sh b/egs/bentham/v1/local/download_bentham_text.sh new file mode 100755 index 00000000000..e09403718a1 --- /dev/null +++ b/egs/bentham/v1/local/download_bentham_text.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Copyright 2018 Desh Raj +# Apache 2.0 + +## Download all written works of Jeremy Bentham for the Bentham HWR task LM training + +baseurl='http://oll.libertyfund.org/titles/' +savedir=$1 + +mkdir -p $savedir + +declare -a texts=("bentham-the-works-of-jeremy-bentham-vol-1/simple" + "bentham-the-works-of-jeremy-bentham-vol-2/simple" + "bentham-the-works-of-jeremy-bentham-vol-3/simple" + "bentham-the-works-of-jeremy-bentham-vol-5-scotch-reform-real-property-codification-petitions/simple" + "bentham-the-works-of-jeremy-bentham-vol-6/simple" + "bentham-the-works-of-jeremy-bentham-vol-7-rationale-of-judicial-evidence-part-2/simple" + "bentham-the-works-of-jeremy-bentham-vol-8/simple" + "bentham-the-works-of-jeremy-bentham-vol-9-constitutional-code" + "bentham-the-works-of-jeremy-bentham-vol-10-memoirs-part-i-and-correspondence/simple" + "bentham-the-works-of-jeremy-bentham-vol-11-memoirs-of-bentham-part-ii-and-analytical-index") + +counter=1 +for i in "${texts[@]}" +do + echo "Downloading $baseurl$i" + curl -s -N {$baseurl}{$i} | sed -e 's/<[^>]*>//g' > $savedir"/bentham"$counter".txt" + ((counter++)) +done + +cat $savedir"/*.txt" > $savedir"/complete.txt" +rm $savedir"/bentham*.txt" diff --git a/egs/bentham/v1/local/extract_features.sh b/egs/bentham/v1/local/extract_features.sh new file mode 100755 index 00000000000..460e467e99c --- /dev/null +++ b/egs/bentham/v1/local/extract_features.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Copyright 2017 Yiwen Shao +# 2018 Ashish Arora + +# Apache 2.0 +# This script runs the make features script in parallel. + +nj=4 +cmd=run.pl +feat_dim=40 +augment='no_aug' +fliplr=false +echo "$0 $@" + +. ./cmd.sh +. ./path.sh +. 
./utils/parse_options.sh || exit 1; + +data=$1 +featdir=$data/data +scp=$data/images.scp +logdir=$data/log + +mkdir -p $logdir +mkdir -p $featdir + +# make $featdir an absolute pathname +featdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print $dir; ' $featdir ${PWD}` + +for n in $(seq $nj); do + split_scps="$split_scps $logdir/images.$n.scp" +done + +# split images.scp +utils/split_scp.pl $scp $split_scps || exit 1; + +$cmd JOB=1:$nj $logdir/extract_features.JOB.log \ + image/ocr/make_features.py $logdir/images.JOB.scp \ + --allowed_len_file_path $data/allowed_lengths.txt \ + --num-channels 4 \ + --feat-dim $feat_dim --fliplr $fliplr --augment_type $augment \| \ + copy-feats --compress=true --compression-method=7 \ + ark:- ark,scp:$featdir/images.JOB.ark,$featdir/images.JOB.scp + +## aggregates the output scp's to get feats.scp +for n in $(seq $nj); do + cat $featdir/images.$n.scp || exit 1; +done > $data/feats.scp || exit 1 diff --git a/egs/bentham/v1/local/gen_topo.py b/egs/bentham/v1/local/gen_topo.py new file mode 100755 index 00000000000..540bfbcf270 --- /dev/null +++ b/egs/bentham/v1/local/gen_topo.py @@ -0,0 +1,92 @@ +#!/usr/bin/env python + +# Copyright 2017 (author: Chun-Chieh Chang) + +# Generate a topology file. This allows control of the number of states in the +# non-silence HMMs, and in the silence HMMs. This is a modified version of +# 'utils/gen_topo.pl'. The difference is that this creates two topologies for +# the non-silence HMMs. The number of states for punctuations is different than +# the number of states for other characters. + +from __future__ import print_function +import argparse +import string + +parser = argparse.ArgumentParser(description="Usage: steps/nnet3/chain/gen_topo.py " + " " + "e.g.: steps/nnet3/chain/gen_topo.pl 4:5:6:7:8:9:10 1:2:3\n", + epilog="See egs/swbd/s5c/local/chain/train_tdnn_a.sh for example of usage."); +parser.add_argument("num_nonsil_states", type=int, help="number of states for nonsilence phones"); +parser.add_argument("num_sil_states", type=int, help="number of states for silence phones"); +parser.add_argument("num_punctuation_states", type=int, help="number of states for punctuation"); +parser.add_argument("nonsilence_phones", type=str, + help="List of non-silence phones as integers, separated by colons, e.g. 4:5:6:7:8:9"); +parser.add_argument("silence_phones", type=str, + help="List of silence phones as integers, separated by colons, e.g. 
1:2:3"); +parser.add_argument("phone_list", type=str, help="file containing all phones and their corresponding number."); + +args = parser.parse_args() + +silence_phones = [ int(x) for x in args.silence_phones.split(":") ] +nonsilence_phones = [ int(x) for x in args.nonsilence_phones.split(":") ] +all_phones = silence_phones + nonsilence_phones + +punctuation_phones = [] +exclude = set("!(),.?;:'-\"") +with open(args.phone_list) as f: + for line in f: + line = line.strip() + phone = line.split(' ')[0] + if len(phone) == 1 and phone in exclude: + punctuation_phones.append(int(line.split(' ')[1])) +# For nonsilence phones that are not punctuations +print("") +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x not in punctuation_phones])) +print("") +for x in range(0, args.num_nonsil_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_nonsil_states) + " ") +print("") + +# For nonsilence phones that ar punctuations +print("") +print("") +print(" ".join([str(x) for x in nonsilence_phones if x in punctuation_phones])) +print("") +for x in range(0, args.num_punctuation_states): + xp1 = x + 1 + print(" " + str(x) + " " + str(x) + " " + str(x) + " 0.75 " + str(xp1) + " 0.25 ") +print(" " + str(args.num_punctuation_states) + " ") +print("") + +# For silence phones +print("") +print("") +print(" ".join([str(x) for x in silence_phones])) +print("") +if(args.num_sil_states > 1): + transp = 1.0 / (args.num_sil_states - 1) + + state_str = " 0 0 " + for x in range(0, (args.num_sil_states - 1)): + state_str = state_str + " " + str(x) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + + for x in range(1, (args.num_sil_states - 1)): + state_str = " " + str(x) + " " + str(x) + " " + for y in range(1, args.num_sil_states): + state_str = state_str + " " + str(y) + " " + str(transp) + " " + state_str = state_str + "" + print(state_str) + second_last = args.num_sil_states - 1 + print(" " + str(second_last) + " " + str(second_last) + " " + str(second_last) + " 0.75 " + str(args.num_sil_states) + " 0.25 ") + print(" " + str(args.num_sil_states) + " ") +else: + print(" 0 0 0 0.75 1 0.25 ") + print(" " + str(args.num_sil_states) + " ") +print("") +print("") diff --git a/egs/bentham/v1/local/prepare_data.sh b/egs/bentham/v1/local/prepare_data.sh new file mode 100755 index 00000000000..bbcc9863611 --- /dev/null +++ b/egs/bentham/v1/local/prepare_data.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +# Copyright 2018 Desh Raj (Johns Hopkins University) + +# Apache 2.0 + +# This script downloads the Bentham handwriting database and prepares the training +# and test data (i.e text, images.scp, utt2spk and spk2utt) by calling create_splits.sh. + +# In addition, it downloads data for all texts of Bentham for LM training purpose. + +stage=0 +download_dir=data/local/download/ +database_dir="" +text_corpus_dir="" + +mkdir -p $download_dir + +. ./cmd.sh +. ./path.sh +. ./utils/parse_options.sh || exit 1; + +BENTHAM_IMAGES_URL='http://transcriptorium.eu/~tsdata/BenthamR0/BenthamDatasetR0-Images.zip' +BENTHAM_GT_URL='http://transcriptorium.eu/~tsdata/BenthamR0/BenthamDatasetR0-GT.zip' +bentham_images=$database_dir"/images.zip" +bentham_gt=$database_dir"/gt.zip" +bentham_text=$download_dir"/text" + +# download and extract images and transcriptions +if [ ! 
-f $bentham_images ]; then + echo "Downloading images and transcriptions to $database_dir" + mkdir -p $database_dir + wget $BENTHAM_IMAGES_URL -O $bentham_images + wget $BENTHAM_GT_URL -O $bentham_gt +else + echo "Not downloading since corpus already exists" +fi + +if [ ! -d $download_dir/"gt" ]; then + unzip $bentham_gt -d $download_dir + mv $download_dir"/BenthamDatasetR0-GT" $download_dir"/gt" +else + echo "Local extracted corpus already exists" +fi + +# Download extra Bentham text for LM training +if [ -d $text_corpus_dir ]; then + echo "$0: Not downloading Bentham text corpus as it is already there." +else + local/download_bentham_text.sh $text_corpus_dir +fi + +# Copy extra Bentham text to local +if [ -d $bentham_text ]; then + echo "$0: Not copying as local Bentham already present." +else + mkdir -p $bentham_text + cp $text_corpus_dir/Bentham-Text/* $bentham_text + echo "$0: Done copying extra Bentham text to local." +fi + +# Creating train, val, and test splits for all directories +if [ -d data/train ]; then + echo "Data splits and files already exist. Not creating again." +else + echo "Creating train, val, and test splits and corresponding files.." + local/create_splits.sh $download_dir "data/" +fi + diff --git a/egs/bentham/v1/local/prepare_dict.sh b/egs/bentham/v1/local/prepare_dict.sh new file mode 100755 index 00000000000..22db5ae834d --- /dev/null +++ b/egs/bentham/v1/local/prepare_dict.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +# Copyright 2017 Hossein Hadian +# 2017 Babak Rekabdar +# 2017 Chun Chieh Chang +# 2017 Ashish Arora + +# This script prepares the dictionary. + +set -e +dir=data/local/dict +. ./utils/parse_options.sh || exit 1; + +mkdir -p $dir + +local/prepare_lexicon.py $dir + +cut -d' ' -f2- $dir/lexicon.txt | sed 's/SIL//g' | tr ' ' '\n' | sort -u | sed '/^$/d' >$dir/nonsilence_phones.txt || exit 1; + +echo ' SIL' >> $dir/lexicon.txt + +echo SIL > $dir/silence_phones.txt + +echo SIL >$dir/optional_silence.txt + +echo -n "" >$dir/extra_questions.txt diff --git a/egs/bentham/v1/local/prepare_lexicon.py b/egs/bentham/v1/local/prepare_lexicon.py new file mode 100755 index 00000000000..3de96056c2a --- /dev/null +++ b/egs/bentham/v1/local/prepare_lexicon.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python3 + +# Copyright 2017 Babak Rekabdar +# 2017 Hossein Hadian +# 2017 Chun Chieh Chang +# 2017 Ashish Arora +# Apache 2.0 + +# This script prepares lexicon for BPE. It gets the set of all words that occur in data/train/text. +# Since this lexicon is based on BPE, it replaces '|' with silence. 
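+# For example (illustrative token, not necessarily present in the data), the
+# BPE unit '|the' would get the lexicon entry
+#   |the SIL t h e
+# i.e. the leading word-boundary marker '|' maps to SIL and the remaining
+# characters become the "phones".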
+ +import argparse +import os + +parser = argparse.ArgumentParser(description="""Creates the list of characters and words in lexicon""") +parser.add_argument('dir', type=str, help='output path') +args = parser.parse_args() + +### main ### +lex = {} +text_path = os.path.join('data', 'train', 'text') +with open(text_path, 'r', encoding='utf-8') as f: + for line in f: + line_vect = line.strip().split(' ') + for i in range(1, len(line_vect)): + characters = list(line_vect[i]) + characters = " ".join([ 'SIL' if char == '|' else char for char in characters]) + characters = list(characters) + characters = "".join([ '' if char == '#' else char for char in characters]) + lex[line_vect[i]] = characters + +with open(os.path.join(args.dir, 'lexicon.txt'), 'w', encoding='utf-8') as fp: + for key in sorted(lex): + fp.write(key + " " + lex[key] + "\n") diff --git a/egs/bentham/v1/local/score.sh b/egs/bentham/v1/local/score.sh new file mode 100755 index 00000000000..1d84815fc69 --- /dev/null +++ b/egs/bentham/v1/local/score.sh @@ -0,0 +1,6 @@ + +#!/bin/bash + + +steps/scoring/score_kaldi_wer.sh "$@" +steps/scoring/score_kaldi_cer.sh --stage 2 "$@" diff --git a/egs/bentham/v1/local/train_lm.sh b/egs/bentham/v1/local/train_lm.sh new file mode 100755 index 00000000000..48632a90769 --- /dev/null +++ b/egs/bentham/v1/local/train_lm.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +# Copyright 2016 Vincent Nguyen +# 2016 Johns Hopkins University (author: Daniel Povey) +# 2017 Ashish Arora +# 2017 Hossein Hadian +# 2018 Desh Raj +# Apache 2.0 +# +# This script trains an LM on the Bentham text corpus and training transcriptions. +# It is based on the example scripts distributed with PocoLM + +# It will check if pocolm is installed and if not will proceed with installation + +set -e +stage=0 +vocab_size=50000 + +echo "$0 $@" # Print the command line for logging +. ./utils/parse_options.sh || exit 1; + +dir=data/local/local_lm +lm_dir=${dir}/data +bentham_text_dir=data/local/download/text/ + +mkdir -p $dir +. ./path.sh || exit 1; # for KALDI_ROOT +export PATH=$KALDI_ROOT/tools/pocolm/scripts:$PATH +( # First make sure the pocolm toolkit is installed. + cd $KALDI_ROOT/tools || exit 1; + if [ -d pocolm ]; then + echo Not installing the pocolm toolkit since it is already there. + else + echo "$0: Please install the PocoLM toolkit with: " + echo " cd ../../../tools; extras/install_pocolm.sh; cd -" + exit 1; + fi +) || exit 1; + +bypass_metaparam_optim_opt= +# If you want to bypass the metaparameter optimization steps with specific metaparameters +# un-comment the following line, and change the numbers to some appropriate values. +# You can find the values from output log of train_lm.py. +# These example numbers of metaparameters is for 4-gram model (with min-counts) +# running with train_lm.py. +# The dev perplexity should be close to the non-bypassed model. 
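+# For reference, the option would look something like the line below (the flag
+# is pocolm's --bypass-metaparameter-optimization; the numbers here are only
+# placeholders and must be taken from a real train_lm.py log for this setup):
+#bypass_metaparam_optim_opt="--bypass-metaparameter-optimization=0.091,0.867,0.753,0.275,0.100,0.018"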
+#bypass_metaparam_optim_opt= +# Note: to use these example parameters, you may need to remove the .done files +# to make sure the make_lm_dir.py be called and tain only 3-gram model +#for order in 3; do +#rm -f ${lm_dir}/${num_word}_${order}.pocolm/.done + +if [ $stage -le 0 ]; then + mkdir -p ${dir}/data + mkdir -p ${dir}/data/text + + echo "$0: Getting the Data sources" + + rm ${dir}/data/text/* 2>/dev/null || true + + # Using Bentham text with last 5000 lines for dev + + cat $bentham_text_dir/complete.txt | \ + sed '/^\s*$/d' | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > ${dir}/bentham.txt + tail -n +5000 ${dir}/bentham.txt > ${dir}/data/text/bentham.txt + + # use the validation data as the dev set. + # Note: the name 'dev' is treated specially by pocolm, it automatically + # becomes the dev set. + head -5000 ${dir}/bentham.txt > ${dir}/data/text/dev.txt + + # use the training data as an additional data source. + # we can later fold the dev data into this. + cat data/train/text | cut -d " " -f 2- > ${dir}/data/text/hwr.txt + + # for reporting perplexities, we'll use the "real" dev set. + # (the validation data is used as ${dir}/data/text/dev.txt to work + # out interpolation weights.) + # note, we can't put it in ${dir}/data/text/, because then pocolm would use + # it as one of the data sources. + cut -d " " -f 2- < data/val/text > ${dir}/data/real_dev_set.txt + + # get the wordlist from Bentham text + cat ${dir}/data/text/{bentham,hwr}.txt | tr '[:space:]' '[\n*]' | grep -v "^\s*$" | sort | uniq -c | sort -bnr > ${dir}/data/word_count + head -n $vocab_size ${dir}/data/word_count | awk '{print $2}' > ${dir}/data/wordlist +fi + +order=6 + +if [ $stage -le 1 ]; then + # decide on the vocabulary. + # Note: you'd use --wordlist if you had a previously determined word-list + # that you wanted to use. + # Note: if you have more than one order, use a certain amount of words as the + # vocab and want to restrict max memory for 'sort', + echo "$0: training the unpruned LM" + min_counts='bentham=1 hwr=1' + wordlist=${dir}/data/wordlist + + lm_name="`basename ${wordlist}`_${order}" + if [ -n "${min_counts}" ]; then + lm_name+="_`echo ${min_counts} | tr -s "[:blank:]" "_" | tr "=" "-"`" + fi + unpruned_lm_dir=${lm_dir}/${lm_name}.pocolm + + train_lm.py --wordlist=${wordlist} --num-splits=10 --warm-start-ratio=20 \ + --limit-unk-history=true \ + ${bypass_metaparam_optim_opt} \ + ${dir}/data/text ${order} ${lm_dir}/work ${unpruned_lm_dir} + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${unpruned_lm_dir} | gzip -c > ${dir}/data/arpa/${order}gram_unpruned.arpa.gz + + get_data_prob.py ${dir}/data/real_dev_set.txt ${unpruned_lm_dir} 2>&1 | grep -F '[perplexity' +fi + +if [ $stage -le 2 ]; then + echo "$0: pruning the LM (to larger size)" + # Using 1 million n-grams for a big LM for rescoring purposes. + size=1000000 + prune_lm_dir.py --target-num-ngrams=$size --initial-threshold=0.02 ${unpruned_lm_dir} ${dir}/data/lm_${order}_prune_big + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_big 2>&1 | grep -F '[perplexity' + + mkdir -p ${dir}/data/arpa + format_arpa_lm.py ${dir}/data/lm_${order}_prune_big | gzip -c > ${dir}/data/arpa/${order}gram_big.arpa.gz +fi + +if [ $stage -le 3 ]; then + echo "$0: pruning the LM (to smaller size)" + # Using 500,000 n-grams for a smaller LM for graph building. Prune from the + # bigger-pruned LM, it'll be faster. 
+ size=500000 + prune_lm_dir.py --target-num-ngrams=$size ${dir}/data/lm_${order}_prune_big ${dir}/data/lm_${order}_prune_small + + get_data_prob.py ${dir}/data/real_dev_set.txt ${dir}/data/lm_${order}_prune_small 2>&1 | grep -F '[perplexity' + + format_arpa_lm.py ${dir}/data/lm_${order}_prune_small | gzip -c > ${dir}/data/arpa/${order}gram_small.arpa.gz +fi diff --git a/egs/bentham/v1/local/wer_output_filter b/egs/bentham/v1/local/wer_output_filter new file mode 100755 index 00000000000..24691a160a9 --- /dev/null +++ b/egs/bentham/v1/local/wer_output_filter @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +# Copyright 2017 Hossein Hadian + +# This is a filter used in scoring. It separates all +# punctuations from words. For e.g. this sentence: + +# "They have come!" he said reverently, gripping his +# hands. "Isn't it a glorious thing! Long awaited." + +# is converted to this: + +# " They have come ! " he said reverently , gripping his +# hands . " Isn ' t it a glorious thing ! Long awaited . " + +# Sample BPE-based output: +# |He |ro se |from |his |b re ak f as t - s ch oo l |b en ch + +import sys +import re + +punctuations = "!(),.?;:'-\"" +escaped_punctuations = re.escape(punctuations) + +for line in sys.stdin: + words = line.strip().split() + uttid = words[0] + transcript = ''.join(words[1:]) + transcript = transcript.replace('|', ' ') + split_transcript = " ".join(re.split("([{}])".format(escaped_punctuations), + transcript)).strip() + print("{} {}".format(uttid, split_transcript)) diff --git a/egs/bentham/v1/path.sh b/egs/bentham/v1/path.sh new file mode 100755 index 00000000000..2d17b17a84a --- /dev/null +++ b/egs/bentham/v1/path.sh @@ -0,0 +1,6 @@ +export KALDI_ROOT=`pwd`/../../.. +[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh +export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH +[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1 +. $KALDI_ROOT/tools/config/common_path.sh +export LC_ALL=C diff --git a/egs/bentham/v1/run_end2end.sh b/egs/bentham/v1/run_end2end.sh new file mode 100755 index 00000000000..63c034e41f6 --- /dev/null +++ b/egs/bentham/v1/run_end2end.sh @@ -0,0 +1,121 @@ +#!/bin/bash +# Copyright 2018 Ashish Arora (Johns Hopkins University) +# 2018 Desh Raj (Johns Hopkins University) + +set -e +stage=0 +nj=20 +# bentham_hwr_database points to the official database path on the JHU grid. If you have not +# already downloaded the data, you will have to first download it and then name the Images +# and Ground Truth zipped files as images.zip and gt.zip. Then, point the path below to the +# location where your zipped files are present on the grid. +bentham_hwr_database=/export/corpora5/handwriting_ocr/hwr1/ICDAR-HTR-Competition-2015 +# bentham_text_database points to the database path on the JHU grid. +# It contains all of the written works of Bentham, and can be used to train +# an LM for the HWR task. We have provided a script which downloads the data +# and saves it to the location provided below. +bentham_text_corpus=/export/corpora5/handwriting_ocr/hwr1/ICDAR-HTR-Competition-2015/Bentham-Text + +. ./cmd.sh ## You'll want to change cmd.sh to something that will work on your system. + ## This relates to the queue. +. ./path.sh +. ./utils/parse_options.sh # e.g. this parses the above options + # if supplied. + + +./local/check_tools.sh + +if [ $stage -le 0 ]; then + echo "$0: Preparing data..." 
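+  # The two corpus paths above default to JHU-grid locations; on other systems
+  # they can be overridden on the command line (picked up by
+  # utils/parse_options.sh), e.g. with hypothetical local paths:
+  #   ./run_end2end.sh --bentham-hwr-database /path/to/bentham/zips \
+  #                    --bentham-text-corpus /path/to/Bentham-Text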
+ local/prepare_data.sh --database-dir $bentham_hwr_database \ + --text-corpus-dir $bentham_text_corpus +fi + +if [ $stage -le 1 ]; then + image/get_image2num_frames.py data/train # This will be needed for the next command + # The next command creates a "allowed_lengths.txt" file in data/train + # which will be used by local/make_features.py to enforce the images to + # have allowed lengths. The allowed lengths will be spaced by 10% difference in length. + image/get_allowed_lengths.py --frame-subsampling-factor 4 10 data/train + echo "$(date) Extracting features, creating feats.scp file" + for dataset in train val test; do + local/extract_features.sh --nj $nj --cmd "$cmd" --feat-dim 40 data/$dataset + steps/compute_cmvn_stats.sh data/$dataset + done + utils/fix_data_dir.sh data/train +fi + +if [ $stage -le 2 ]; then + echo "$0: Preparing BPE..." + # getting non-silence phones. + cut -d' ' -f2- data/train/text | \ +python3 <( +cat << "END" +import os, sys, io; +infile = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8'); +output = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8'); +phone_dict = dict(); +for line in infile: + line_vect = line.strip().split(); + for word in line_vect: + for phone in word: + phone_dict[phone] = phone; +for phone in phone_dict.keys(): + output.write(phone+ '\n'); +END + ) > data/local/phones.txt + + cut -d' ' -f2- data/train/text > data/local/train_data.txt + cat data/local/phones.txt data/local/train_data.txt | \ + utils/lang/bpe/prepend_words.py | \ + utils/lang/bpe/learn_bpe.py -s 700 > data/local/bpe.txt + for set in test train val; do + cut -d' ' -f1 data/$set/text > data/$set/ids + cut -d' ' -f2- data/$set/text | \ + utils/lang/bpe/prepend_words.py | utils/lang/bpe/apply_bpe.py -c data/local/bpe.txt \ + | sed 's/@@//g' > data/$set/bpe_text + mv data/$set/text data/$set/text.old + paste -d' ' data/$set/ids data/$set/bpe_text > data/$set/text + done +fi + +if [ $stage -le 3 ]; then + echo "$0: Estimating a language model for decoding..." + local/train_lm.sh +fi + +if [ $stage -le 4 ]; then + echo "$0: Preparing dictionary and lang..." + local/prepare_dict.sh + # This recipe uses byte-pair encoding, the silences are part of the words' pronunciations. + # So we set --sil-prob to 0.0 + utils/prepare_lang.sh --num-sil-states 4 --num-nonsil-states 8 --sil-prob 0.0 --position-dependent-phones false \ + data/local/dict "" data/lang/temp data/lang + silphonelist=`cat data/lang/phones/silence.csl` + nonsilphonelist=`cat data/lang/phones/nonsilence.csl` + local/gen_topo.py 8 4 4 $nonsilphonelist $silphonelist data/lang/phones.txt >data/lang/topo + utils/lang/bpe/add_final_optional_silence.sh --final-sil-prob 0.5 data/lang + + utils/format_lm.sh data/lang data/local/local_lm/data/arpa/6gram_big.arpa.gz \ + data/local/dict/lexicon.txt data/lang + utils/build_const_arpa_lm.sh data/local/local_lm/data/arpa/6gram_unpruned.arpa.gz \ + data/lang data/lang_rescore_6g +fi + +if [ $stage -le 5 ]; then + echo "$0: Calling the flat-start chain recipe..." + local/chain/run_e2e_cnn.sh +fi + +if [ $stage -le 6 ]; then + echo "$0: Aligning the training data using the e2e chain model..." + steps/nnet3/align.sh --nj 50 --cmd "$cmd" \ + --use-gpu false \ + --scale-opts '--transition-scale=1.0 --self-loop-scale=1.0 --acoustic-scale=1.0' \ + data/train data/lang exp/chain/e2e_cnn_1a exp/chain/e2e_ali_train +fi + +if [ $stage -le 7 ]; then + echo "$0: Building a tree and training a regular chain model using the e2e alignments..." 
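+  # Once this finishes, the flat-start and regular chain systems can be
+  # compared with, e.g.:
+  #   local/chain/compare_wer.sh exp/chain/e2e_cnn_1a exp/chain/cnn_e2eali_1a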
+ local/chain/run_cnn_e2eali.sh +fi diff --git a/egs/bentham/v1/steps b/egs/bentham/v1/steps new file mode 120000 index 00000000000..1b186770dd1 --- /dev/null +++ b/egs/bentham/v1/steps @@ -0,0 +1 @@ +../../wsj/s5/steps/ \ No newline at end of file diff --git a/egs/bentham/v1/utils b/egs/bentham/v1/utils new file mode 120000 index 00000000000..a3279dc8679 --- /dev/null +++ b/egs/bentham/v1/utils @@ -0,0 +1 @@ +../../wsj/s5/utils/ \ No newline at end of file diff --git a/egs/cifar/v1/image/get_allowed_lengths.py b/egs/cifar/v1/image/get_allowed_lengths.py index 02321fdd2df..44e17028695 100755 --- a/egs/cifar/v1/image/get_allowed_lengths.py +++ b/egs/cifar/v1/image/get_allowed_lengths.py @@ -117,7 +117,7 @@ def find_allowed_durations(start_len, end_len, args): (length // args.frame_subsampling_factor)) allowed_lengths.append(length) fp.write("{}\n".format(int(length))) - length *= args.factor + length = max(length * args.factor, length + args.frame_subsampling_factor) return allowed_lengths diff --git a/egs/cifar/v1/image/ocr/make_features.py b/egs/cifar/v1/image/ocr/make_features.py index a11cbcc7a82..aa909f596c9 100755 --- a/egs/cifar/v1/image/ocr/make_features.py +++ b/egs/cifar/v1/image/ocr/make_features.py @@ -4,6 +4,7 @@ # 2017 Ashish Arora # 2017 Yiwen Shao # 2018 Hossein Hadian +# 2018 Desh Raj """ This script converts images to Kaldi-format feature matrices. The input to this script is the path to a data directory, e.g. "data/train". This script @@ -88,10 +89,16 @@ def horizontal_pad(im, allowed_lengths = None): left_padding = int(padding // 2) right_padding = padding - left_padding dim_y = im.shape[0] # height - im_pad = np.concatenate((255 * np.ones((dim_y, left_padding, args.num_channels), - dtype=int), im), axis=1) - im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding, args.num_channels), - dtype=int)), axis=1) + if args.num_channels in [1,4]: + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding), + dtype=int)), axis=1) + else: + im_pad = np.concatenate((255 * np.ones((dim_y, left_padding, args.num_channels), + dtype=int), im), axis=1) + im_pad1 = np.concatenate((im_pad, 255 * np.ones((dim_y, right_padding, args.num_channels), + dtype=int)), axis=1) return im_pad1 def get_scaled_image_aug(im, mode='normal'): @@ -169,7 +176,10 @@ def vertical_shift(im, mode='normal'): line_vect = line.split(' ') image_id = line_vect[0] image_path = line_vect[1] - im = misc.imread(image_path) + if args.num_channels == 4: + im = misc.imread(image_path, mode='L') + else: + im = misc.imread(image_path) if args.fliplr: im = np.fliplr(im) if args.augment_type == 'no_aug' or 'random_shift': @@ -184,7 +194,7 @@ def vertical_shift(im, mode='normal'): im = vertical_shift(im, 'normal') elif args.augment_type == 'random_shift': im = vertical_shift(im, 'notmid') - if args.num_channels == 1: + if args.num_channels in [1,4]: data = np.transpose(im, (1, 0)) elif args.num_channels == 3: H = im.shape[0]