Skip to content

Commit

Permalink
add phonetisaurus-based g2p
Browse files Browse the repository at this point in the history
  • Loading branch information
huangruizhe committed Sep 21, 2018
1 parent ebbae44 commit 61d9560
Show file tree
Hide file tree
Showing 4 changed files with 540 additions and 4 deletions.
29 changes: 25 additions & 4 deletions egs/multi_en/s5/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@ if [ $stage -le 1 ]; then
# We prepare the basic dictionary in data/local/dict_combined.
local/prepare_dict.sh $swbd $tedlium2
(
local/g2p/train_g2p.sh --stage 0 --silence-phones \
"data/local/dict_combined/silence_phones.txt" data/local/dict_combined exp/g2p || touch exp/g2p/.error
steps/dict/train_g2p_phonetisaurus.sh --stage 0 --silence-phones \
"data/local/dict_combined/silence_phones.txt" data/local/dict_combined/lexicon.txt exp/g2p || touch exp/g2p/.error
) &
fi

Expand Down Expand Up @@ -114,10 +114,31 @@ if [ $stage -le 4 ]; then
mkdir -p $dict_dir
rm $dict_dir/lexiconp.txt 2>/dev/null || true
cp data/local/dict_combined/{extra_questions,nonsilence_phones,silence_phones,optional_silence}.txt $dict_dir
local/g2p/apply_g2p.sh --var-counts 1 exp/g2p/model.fst data/local/g2p_phonetisarus \
data/local/dict_combined/lexicon.txt $dict_dir/lexicon.txt || exit 1;

# awk command from http://stackoverflow.com/questions/2626274/print-all-but-the-first-three-columns
echo 'Gathering missing words...'

lexicon=data/local/dict_combined/lexicon.txt
g2p_tmp_dir=data/local/g2p_phonetisarus
mkdir -p $g2p_tmp_dir

cat data/*/train/text | \
local/count_oovs.pl $lexicon | \
awk '{if (NF > 3 ) {for(i=4; i<NF; i++) printf "%s ",$i; print $NF;}}' | \
perl -ape 's/\s/\n/g;' | \
sort | uniq > $g2p_tmp_dir/missing.txt
cat $g2p_tmp_dir/missing.txt | \
grep "^[a-z]*$" > $g2p_tmp_dir/missing_onlywords.txt

steps/dict/apply_g2p_phonetisaurus.sh --nbest 1 exp/g2p/model.fst $g2p_tmp_dir/missing_onlywords.txt $g2p_tmp_dir/missing_lexicon.txt || exit 1;

expanded_lexicon=$dict_dir/lexicon.txt
echo "Adding new pronunciations to get expanded lexicon $expanded_lexicon"
cat "$lexicon" $g2p_tmp_dir/missing_lexicon.txt | sort | uniq > $expanded_lexicon
fi

exit 0

# We'll do multiple iterations of pron/sil-prob estimation, so the dict/lang
# dirs are named ${dict/lang_root}_${dict_affix}, where dict_affix
# is "nosp" or the name of the acoustic model we use to estimate pron/sil-probs.
Expand Down
56 changes: 56 additions & 0 deletions egs/wsj/s5/steps/dict/apply_g2p_phonetisaurus.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/bin/bash
# Copyright 2014 Johns Hopkins University (Author: Yenda Trmal)
# Copyright 2016 Xiaohui Zhang
# 2018 Ruizhe Huang
# Apache 2.0

# This script applies a trained Phonetisaurus G2P model to a list of words
# (typically words that occur in the transcripts but not in the lexicon) and
# writes the synthesized pronunciations for those words to <lexicon-out>.
# Merging the result with an existing lexicon is left to the caller.

# Begin configuration section.
stage=0   # Not used by this script; kept so that --stage is still accepted.
nbest=1   # Generate up to N variants per word (used only when pmass is empty).
pmass=    # If non-empty (e.g. 0.9), passed as --pmass instead of --nbest, so
          # that variants are selected by total probability mass.
model=    # Overwritten by the first positional argument below.
# End configuration section.

echo "$0 $*"  # Print the command line for logging

[ -f ./path.sh ] && . ./path.sh; # source the path.
. utils/parse_options.sh || exit 1;

set -u
set -e

if [ $# != 3 ]; then
  echo "Usage: $0 [options] <g2p-model> <word-list> <lexicon-out>"
  echo "... where <g2p-model> is the trained g2p model"
  echo " <word-list> is a list of words whose pronunciation is to be generated"
  echo " <lexicon-out> output lexicon, whose format is ...." # TODO
  echo "e.g.: $0 exp/g2p/model.fst exp/g2p/oov_words.txt data/local/dict_nosp/lexicon.txt"
  echo ""
  echo "main options (for others, see top of script file)"
  echo " --nbest <int> # Maximum number of hypotheses to produce. By default, nbest=1."
  echo " --pmass <float> # Select the maximum number of hypotheses summing to pmass total mass"
  echo " # for a word. By default, pmass is disabled."
  exit 1;
fi

model=$1
word_list=$2
out_lexicon=$3

# Fail early, before the output redirection below truncates $out_lexicon.
for f in "$model" "$word_list"; do
  if [ ! -r "$f" ]; then
    echo "$0: expected input $f to exist and be readable" >&2
    exit 1
  fi
done

# Build the variant-selection option as an array so that each value stays a
# single word regardless of its contents.
if [ -z "$pmass" ]; then
  echo "Synthesizing pronunciations for words in $word_list based on nbest = $nbest"
  g2p_opts=(--nbest "$nbest")
else
  echo "Synthesizing pronunciations for words in $word_list based on pmass = $pmass"
  g2p_opts=(--pmass "$pmass")
fi

phonetisaurus-apply "${g2p_opts[@]}" --model "$model" --thresh 5 --accumulate \
  --word_list "$word_list" > "$out_lexicon"

echo "Finished. Synthesized lexicon for new words is in $out_lexicon"

exit 0
Loading

0 comments on commit 61d9560

Please sign in to comment.