-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1314 from yt605155624/add_new_tacotron2
[TTS]Add new tacotron2
- Loading branch information
Showing
46 changed files
with
3,224 additions
and
518 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# This configuration is for Paddle to train Tacotron 2. Compared to the | ||
# original paper, this configuration additionally uses the guided attention | ||
# loss to accelerate the learning of the diagonal attention. It requires | ||
# only a single GPU with 12 GB memory and it takes ~1 day to finish the | ||
# training on Titan V. | ||
|
||
########################################################### | ||
# FEATURE EXTRACTION SETTING # | ||
########################################################### | ||
|
||
fs: 24000 # Sampling rate (Hz). | ||
n_fft: 2048 # FFT size (samples). | ||
n_shift: 300 # Hop size (samples). 12.5ms | ||
win_length: 1200 # Window length (samples). 50ms | ||
# If set to null, it will be the same as n_fft. | ||
window: "hann" # Window function. | ||
|
||
# Only used for feats_type != raw | ||
|
||
fmin: 80 # Minimum frequency of Mel basis. | ||
fmax: 7600 # Maximum frequency of Mel basis. | ||
n_mels: 80 # The number of mel basis. | ||
|
||
########################################################### | ||
# DATA SETTING # | ||
########################################################### | ||
batch_size: 64 | ||
num_workers: 2 | ||
|
||
########################################################### | ||
# MODEL SETTING # | ||
########################################################### | ||
model: # keyword arguments for the selected model | ||
embed_dim: 512 # char or phn embedding dimension | ||
elayers: 1 # number of blstm layers in encoder | ||
eunits: 512 # number of blstm units | ||
econv_layers: 3 # number of convolutional layers in encoder | ||
econv_chans: 512 # number of channels in convolutional layer | ||
econv_filts: 5 # filter size of convolutional layer | ||
atype: location # attention function type | ||
adim: 512 # attention dimension | ||
aconv_chans: 32 # number of channels in convolutional layer of attention | ||
aconv_filts: 15 # filter size of convolutional layer of attention | ||
cumulate_att_w: True # whether to cumulate attention weight | ||
dlayers: 2 # number of lstm layers in decoder | ||
dunits: 1024 # number of lstm units in decoder | ||
prenet_layers: 2 # number of layers in prenet | ||
prenet_units: 256 # number of units in prenet | ||
postnet_layers: 5 # number of layers in postnet | ||
postnet_chans: 512 # number of channels in postnet | ||
postnet_filts: 5 # filter size of postnet layer | ||
output_activation: null # activation function for the final output | ||
use_batch_norm: True # whether to use batch normalization in encoder | ||
use_concate: True # whether to concatenate encoder embedding with decoder outputs | ||
use_residual: False # whether to use residual connection in encoder | ||
dropout_rate: 0.5 # dropout rate | ||
zoneout_rate: 0.1 # zoneout rate | ||
reduction_factor: 1 # reduction factor | ||
spk_embed_dim: null # speaker embedding dimension | ||
|
||
|
||
########################################################### | ||
# UPDATER SETTING # | ||
########################################################### | ||
updater: | ||
use_masking: True # whether to apply masking for padded part in loss calculation | ||
bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation | ||
use_guided_attn_loss: True # whether to use guided attention loss | ||
guided_attn_loss_sigma: 0.4 # sigma of guided attention loss | ||
guided_attn_loss_lambda: 1.0 # strength of guided attention loss | ||
|
||
|
||
########################################################## | ||
# OPTIMIZER SETTING # | ||
########################################################## | ||
optimizer: | ||
optim: adam # optimizer type | ||
learning_rate: 1.0e-03 # learning rate | ||
epsilon: 1.0e-06 # epsilon | ||
weight_decay: 0.0 # weight decay coefficient | ||
|
||
########################################################### | ||
# TRAINING SETTING # | ||
########################################################### | ||
max_epoch: 200 | ||
num_snapshots: 5 | ||
|
||
########################################################### | ||
# OTHER SETTING # | ||
########################################################### | ||
seed: 42 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
#!/bin/bash
# Data preparation for the Tacotron2 recipe: durations, feature extraction,
# feature statistics and normalization.
#
# Usage: preprocess.sh <config_path>
#
# Required environment (must be exported by the caller):
#   MAIN_ROOT - root that contains utils/gen_duration_from_textgrid.py and
#               utils/compute_statistics.py
#   BIN_DIR   - directory that contains preprocess.py and normalize.py
#
# Stages run in order; the script stops at the first failing command so that
# a later stage never consumes the output of a failed earlier one.
set -euo pipefail

stage=0
stop_stage=100

config_path=${1:?"usage: $0 <config_path>"}
: "${MAIN_ROOT:?MAIN_ROOT must be set}"
: "${BIN_DIR:?BIN_DIR must be set}"

if (( stage <= 0 && stop_stage >= 0 )); then
  # Get durations from MFA's result.
  echo "Generate durations.txt from MFA results ..."
  python3 "${MAIN_ROOT}/utils/gen_duration_from_textgrid.py" \
    --inputdir=./baker_alignment_tone \
    --output=durations.txt \
    --config="${config_path}"
fi

if (( stage <= 1 && stop_stage >= 1 )); then
  # Extract features.
  echo "Extract features ..."
  # "${HOME}" instead of "~": Bash does not perform tilde expansion inside
  # an argument of the form --option=~/path.
  python3 "${BIN_DIR}/preprocess.py" \
    --dataset=baker \
    --rootdir="${HOME}/datasets/BZNSYP/" \
    --dumpdir=dump \
    --dur-file=durations.txt \
    --config="${config_path}" \
    --num-cpu=20 \
    --cut-sil=True
fi

if (( stage <= 2 && stop_stage >= 2 )); then
  # Get the features' stats (mean and std).
  echo "Get features' stats ..."
  python3 "${MAIN_ROOT}/utils/compute_statistics.py" \
    --metadata=dump/train/raw/metadata.jsonl \
    --field-name="speech"
fi

if (( stage <= 3 && stop_stage >= 3 )); then
  # Normalize and convert phones to ids. dev and test must use the stats
  # computed on train, so --speech-stats always points at dump/train.
  echo "Normalize ..."
  for subset in train dev test; do
    python3 "${BIN_DIR}/normalize.py" \
      --metadata="dump/${subset}/raw/metadata.jsonl" \
      --dumpdir="dump/${subset}/norm" \
      --speech-stats=dump/train/speech_stats.npy \
      --phones-dict=dump/phone_id_map.txt \
      --speaker-dict=dump/speaker_id_map.txt
  done
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/bin/bash
# Synthesize the test set with a trained Tacotron2 acoustic model and the
# pretrained Parallel WaveGAN vocoder in pwg_baker_ckpt_0.4/.
#
# Usage: synthesize.sh <config_path> <train_output_path> <ckpt_name>
#   config_path       - acoustic model config passed as --am_config
#   train_output_path - training output dir; the checkpoint is read from
#                       <train_output_path>/checkpoints/<ckpt_name> and the
#                       results are written to <train_output_path>/test
#   ckpt_name         - checkpoint file name inside checkpoints/
#
# Required environment: BIN_DIR (synthesize.py is looked up in its parent).
set -euo pipefail

usage="usage: $0 <config_path> <train_output_path> <ckpt_name>"
config_path=${1:?"${usage}"}
train_output_path=${2:?"${usage}"}
ckpt_name=${3:?"${usage}"}
: "${BIN_DIR:?BIN_DIR must be set}"

# The FLAGS_* assignments apply only to this python3 invocation.
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 "${BIN_DIR}/../synthesize.py" \
  --am=tacotron2_csmsc \
  --am_config="${config_path}" \
  --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
  --am_stat=dump/train/speech_stats.npy \
  --voc=pwgan_csmsc \
  --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
  --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
  --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
  --test_metadata=dump/test/norm/metadata.jsonl \
  --output_dir="${train_output_path}/test" \
  --phones_dict=dump/phone_id_map.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
#!/bin/bash
# End-to-end synthesis (sentences.txt -> audio) with a trained Tacotron2
# acoustic model and one of several pretrained vocoders, selected by stage.
#
# Usage: synthesize_e2e.sh <config_path> <train_output_path> <ckpt_name>
#   config_path       - acoustic model config passed as --am_config
#   train_output_path - training output dir; the checkpoint is read from
#                       <train_output_path>/checkpoints/<ckpt_name> and the
#                       results are written to <train_output_path>/test_e2e
#   ckpt_name         - checkpoint file name inside checkpoints/
#
# Required environment: BIN_DIR (synthesize_e2e.py and sentences.txt are
# looked up in its parent directory).
#
# Stages: 0 = Parallel WaveGAN, 1 = Multi-band MelGAN, 2 = Style MelGAN,
#         3 = HiFiGAN. Only stage 0 runs with the default stage/stop_stage.
set -euo pipefail

usage="usage: $0 <config_path> <train_output_path> <ckpt_name>"
config_path=${1:?"${usage}"}
train_output_path=${2:?"${usage}"}
ckpt_name=${3:?"${usage}"}
: "${BIN_DIR:?BIN_DIR must be set}"

stage=0
stop_stage=0

#######################################
# Run synthesize_e2e.py with the Tacotron2 checkpoint and one vocoder.
# Every stage uses --am=tacotron2_csmsc because the config and checkpoint
# handed to this script belong to the Tacotron2 model.
# Globals:   BIN_DIR, config_path, train_output_path, ckpt_name (read)
# Arguments: $1 - vocoder name passed as --voc
#            $2 - vocoder config file
#            $3 - vocoder checkpoint file
#            $4 - vocoder statistics file
#            $5 - "true" to also pass --inference_dir, anything else to omit
# Returns:   exit status of python3
#######################################
run_synthesize_e2e() {
  local voc=$1
  local voc_config=$2
  local voc_ckpt=$3
  local voc_stat=$4
  local with_inference_dir=$5
  local -a extra_args=()

  if [[ "${with_inference_dir}" == "true" ]]; then
    extra_args+=(--inference_dir="${train_output_path}/inference")
  fi

  # The FLAGS_* assignments apply only to this python3 invocation.
  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on
  # older Bash releases.
  FLAGS_allocator_strategy=naive_best_fit \
  FLAGS_fraction_of_gpu_memory_to_use=0.01 \
  python3 "${BIN_DIR}/../synthesize_e2e.py" \
    --am=tacotron2_csmsc \
    --am_config="${config_path}" \
    --am_ckpt="${train_output_path}/checkpoints/${ckpt_name}" \
    --am_stat=dump/train/speech_stats.npy \
    --voc="${voc}" \
    --voc_config="${voc_config}" \
    --voc_ckpt="${voc_ckpt}" \
    --voc_stat="${voc_stat}" \
    --lang=zh \
    --text="${BIN_DIR}/../sentences.txt" \
    --output_dir="${train_output_path}/test_e2e" \
    --phones_dict=dump/phone_id_map.txt \
    ${extra_args[@]+"${extra_args[@]}"}
}

# Parallel WaveGAN
if (( stage <= 0 && stop_stage >= 0 )); then
  run_synthesize_e2e pwgan_csmsc \
    pwg_baker_ckpt_0.4/pwg_default.yaml \
    pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
    pwg_baker_ckpt_0.4/pwg_stats.npy \
    true
fi

# For more GAN vocoders.
# Multi-band MelGAN
if (( stage <= 1 && stop_stage >= 1 )); then
  run_synthesize_e2e mb_melgan_csmsc \
    mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
    mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz \
    mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
    true
fi

# Style MelGAN
# The pretrained models have not been released yet, and dygraph-to-static
# conversion is not ready for Style MelGAN, so --inference_dir is omitted.
if (( stage <= 2 && stop_stage >= 2 )); then
  run_synthesize_e2e style_melgan_csmsc \
    style_melgan_csmsc_ckpt_0.1.1/default.yaml \
    style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
    style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
    false
fi

# HiFiGAN
if (( stage <= 3 && stop_stage >= 3 )); then
  echo "in hifigan syn_e2e"
  run_synthesize_e2e hifigan_csmsc \
    hifigan_csmsc_ckpt_0.1.1/default.yaml \
    hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
    hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
    true
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash
# Train the Tacotron2 acoustic model on the normalized dump/ features.
#
# Usage: train.sh <config_path> <train_output_path>
#   config_path       - training config passed as --config
#   train_output_path - directory passed as --output-dir
#
# Required environment: BIN_DIR (directory that contains train.py).
set -euo pipefail

usage="usage: $0 <config_path> <train_output_path>"
config_path=${1:?"${usage}"}
train_output_path=${2:?"${usage}"}
: "${BIN_DIR:?BIN_DIR must be set}"

python3 "${BIN_DIR}/train.py" \
  --train-metadata=dump/train/norm/metadata.jsonl \
  --dev-metadata=dump/dev/norm/metadata.jsonl \
  --config="${config_path}" \
  --output-dir="${train_output_path}" \
  --ngpu=1 \
  --phones-dict=dump/phone_id_map.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash
# Environment setup for the recipe: exports MAIN_ROOT, PATH, locale and
# Python settings, and BIN_DIR.
# NOTE(review): presumably meant to be sourced from the recipe directory,
# since it only exports variables and resolves MAIN_ROOT relative to ${PWD}.

# Resolve the root three levels above the current directory. The assignment
# is kept separate from `export` so that a realpath failure is not masked,
# and ${PWD} is quoted so that paths containing spaces resolve correctly.
if ! MAIN_ROOT=$(realpath "${PWD}/../../../"); then
  echo "error: cannot resolve MAIN_ROOT from ${PWD}" >&2
  # `return` works when sourced; fall back to `exit` when executed directly.
  return 1 2>/dev/null || exit 1
fi
export MAIN_ROOT

export PATH="${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}"
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# ${PYTHONPATH:-} yields the same value as before but is safe when the
# caller runs with `set -u` and PYTHONPATH is unset.
export PYTHONPATH="${MAIN_ROOT}:${PYTHONPATH:-}"

MODEL=new_tacotron2
export BIN_DIR="${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}"
Oops, something went wrong.