diff --git a/egs/multi_en/s5/local/g2p/apply_g2p.sh b/egs/multi_en/s5/local/g2p/apply_g2p.sh deleted file mode 100755 index 8484155800d..00000000000 --- a/egs/multi_en/s5/local/g2p/apply_g2p.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# Copyright 2016 Allen Guo -# 2017 Xiaohui Zhang -# Apache License 2.0 - -# This script applies a trained Phonetisarus G2P model to -# synthesize pronunciations for missing words (i.e., words in -# transcripts but not the lexicon), and output the expanded lexicon. - -var_counts=1 - -. ./path.sh || exit 1 -. parse_options.sh || exit 1; - -if [ $# -ne "4" ]; then - echo "Usage: $0 " - exit 1 -fi - -model=$1 -workdir=$2 -lexicon=$3 -outlexicon=$4 - -mkdir -p $workdir - -# awk command from http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns -echo 'Gathering missing words...' -cat data/*/train/text | \ - local/count_oovs.pl $lexicon | \ - awk '{if (NF > 3 ) {for(i=4; i $workdir/missing.txt -cat $workdir/missing.txt | \ - grep "^[a-z]*$" > $workdir/missing_onlywords.txt - -echo 'Synthesizing pronunciations for missing words...' -phonetisaurus-apply --nbest $var_counts --model $model --thresh 5 --accumulate --word_list $workdir/missing_onlywords.txt > $workdir/missing_g2p_${var_counts}.txt - -echo "Adding new pronunciations to $lexicon" -cat "$lexicon" $workdir/missing_g2p_${var_counts}.txt | sort | uniq > $outlexicon diff --git a/egs/multi_en/s5/local/g2p/train_g2p.sh b/egs/multi_en/s5/local/g2p/train_g2p.sh deleted file mode 100755 index 43e75f6608d..00000000000 --- a/egs/multi_en/s5/local/g2p/train_g2p.sh +++ /dev/null @@ -1,67 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Intellisist, Inc. (Author: Navneeth K) -# 2017 Xiaohui Zhang -# Apache License 2.0 - -# This script trains a g2p model using Phonetisaurus and SRILM. - -stage=0 -silence_phones= - -echo "$0 $@" # Print the command line for logging - -[ -f ./path.sh ] && . ./path.sh; # source the path. -. utils/parse_options.sh || exit 1; - - -if [ $# -ne 2 ]; then - echo "Usage: $0 " - exit 1; -fi - -lexicondir=$1 -outdir=$2 - -[ ! -f $lexicondir/lexicon.txt ] && echo "Cannot find $lexicondir/lexicon.txt" && exit - -isuconv=`which uconv` -if [ -z $isuconv ]; then - echo "uconv was not found. You must install the icu4c package." - exit 1; -fi - -mkdir -p $outdir - - -# For input lexicon, remove pronunciations containing non-utf-8-encodable characters, -# and optionally remove words that are mapped to a single silence phone from the lexicon. -if [ $stage -le 0 ]; then - lexicon=$lexicondir/lexicon.txt - if [ ! -z "$silence_phones" ]; then - awk 'NR==FNR{a[$1] = 1; next} {s=$2;for(i=3;i<=NF;i++) s=s" "$i; if(!(s in a)) print $1" "s}' \ - $silence_phones $lexicon | \ - awk '{printf("%s\t",$1); for (i=2;i 0'> $outdir/lexicon_tab_separated.txt - else - awk '{printf("%s\t",$1); for (i=2;i 0'> $outdir/lexicon_tab_separated.txt - fi -fi - -if [ $stage -le 1 ]; then - # Align lexicon stage. Lexicon is assumed to have first column tab separated - phonetisaurus-align --input=$outdir/lexicon_tab_separated.txt --ofile=${outdir}/aligned_lexicon.corpus || exit 1; -fi - -if [ $stage -le 2 ]; then - # Convert aligned lexicon to arpa using srilm. - ngram-count -order 7 -kn-modify-counts-at-end -gt1min 0 -gt2min 0 \ - -gt3min 0 -gt4min 0 -gt5min 0 -gt6min 0 -gt7min 0 -ukndiscount \ - -text ${outdir}/aligned_lexicon.corpus -lm ${outdir}/aligned_lexicon.arpa -fi - -if [ $stage -le 3 ]; then - # Convert the arpa file to FST. - phonetisaurus-arpa2wfst --lm=${outdir}/aligned_lexicon.arpa --ofile=${outdir}/model.fst -fi diff --git a/egs/multi_en/s5/run.sh b/egs/multi_en/s5/run.sh index c3eb3503237..f2d29e8de32 100755 --- a/egs/multi_en/s5/run.sh +++ b/egs/multi_en/s5/run.sh @@ -137,8 +137,6 @@ if [ $stage -le 4 ]; then cat "$lexicon" $g2p_tmp_dir/missing_lexicon.txt | sort | uniq > $expanded_lexicon fi -exit 0 - # We'll do multiple iterations of pron/sil-prob estimation. So the structure of # the dict/lang dirs are designed as ${dict/lang_root}_${dict_affix}, where dict_affix # is "nosp" or the name of the acoustic model we use to estimate pron/sil-probs. diff --git a/egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh b/egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh index 55eab2748f6..4a0bcb88024 100755 --- a/egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh +++ b/egs/wsj/s5/steps/dict/train_g2p_phonetisaurus.sh @@ -72,7 +72,7 @@ fi if [ $stage -le 2 ]; then # Convert aligned lexicon to arpa using make_kn_lm.py, a re-implementation of srilm's ngram-count functionality. - ./steps/dict/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa + ./utils/lang/make_kn_lm.py -ngram-order 7 -text ${wdir}/aligned_lexicon.corpus -lm ${wdir}/aligned_lexicon.arpa fi if [ $stage -le 3 ]; then diff --git a/egs/wsj/s5/steps/dict/make_kn_lm.py b/egs/wsj/s5/utils/lang/make_kn_lm.py similarity index 100% rename from egs/wsj/s5/steps/dict/make_kn_lm.py rename to egs/wsj/s5/utils/lang/make_kn_lm.py