Merge pull request #1314 from yt605155624/add_new_tacotron2
[TTS]Add new tacotron2
zh794390558 authored Jan 19, 2022
2 parents 320bb0f + 9632381 commit 97db74c
Showing 46 changed files with 3,224 additions and 518 deletions.
9 changes: 4 additions & 5 deletions examples/aishell3/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.


###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
-stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
-stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type

@@ -84,7 +84,6 @@ updater:
use_masking: True # whether to apply masking for padded part in loss calculation



###########################################################
# OPTIMIZER SETTING #
###########################################################
8 changes: 4 additions & 4 deletions examples/aishell3/vc1/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.


###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
-stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
-stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type

6 changes: 3 additions & 3 deletions examples/aishell3/voc1/conf/default.yaml
@@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
-use_weight_norm: true # Whether to use weight norm.
+use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift

@@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of conv channels.
-bias: true # Whether to use bias parameter in conv.
-use_weight_norm: true # Whether to use weight norm.
+bias: True # Whether to use bias parameter in conv.
+use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
91 changes: 91 additions & 0 deletions examples/csmsc/tts0/conf/default.yaml
@@ -0,0 +1,91 @@
# This configuration is for Paddle to train Tacotron 2. Compared to the
# original paper, this configuration additionally uses the guided attention
# loss to accelerate the learning of the diagonal attention. It requires
# only a single GPU with 12 GB memory, and training takes about one day
# on a Titan V.

###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################

fs: 24000 # Sampling rate (Hz).
n_fft: 2048 # FFT size (samples).
n_shift: 300 # Hop size (samples). 12.5ms
win_length: 1200 # Window length (samples). 50ms
# If set to null, it will be the same as n_fft.
window: "hann" # Window function.

# Only used for feats_type != raw

fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.

###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
# MODEL SETTING #
###########################################################
model: # keyword arguments for the selected model
embed_dim: 512 # char or phn embedding dimension
elayers: 1 # number of blstm layers in encoder
eunits: 512 # number of blstm units
econv_layers: 3 # number of convolutional layers in encoder
econv_chans: 512 # number of channels in convolutional layer
econv_filts: 5 # filter size of convolutional layer
atype: location # attention function type
adim: 512 # attention dimension
aconv_chans: 32 # number of channels in convolutional layer of attention
aconv_filts: 15 # filter size of convolutional layer of attention
cumulate_att_w: True # whether to cumulate attention weight
dlayers: 2 # number of lstm layers in decoder
dunits: 1024 # number of lstm units in decoder
prenet_layers: 2 # number of layers in prenet
prenet_units: 256 # number of units in prenet
postnet_layers: 5 # number of layers in postnet
postnet_chans: 512 # number of channels in postnet
postnet_filts: 5 # filter size of postnet layer
output_activation: null # activation function for the final output
use_batch_norm: True # whether to use batch normalization in encoder
use_concate: True # whether to concatenate encoder embedding with decoder outputs
use_residual: False # whether to use residual connection in encoder
dropout_rate: 0.5 # dropout rate
zoneout_rate: 0.1 # zoneout rate
reduction_factor: 1 # reduction factor
spk_embed_dim: null # speaker embedding dimension


###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
use_guided_attn_loss: True # whether to use guided attention loss
guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
guided_attn_loss_lambda: 1.0 # strength of guided attention loss


##########################################################
# OPTIMIZER SETTING #
##########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 1.0e-03 # learning rate
epsilon: 1.0e-06 # epsilon
weight_decay: 0.0 # weight decay coefficient

###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
# OTHER SETTING #
###########################################################
seed: 42
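
The updater block above enables the guided attention loss of Tachibana et al. (2017), which penalizes encoder-decoder attention far from the diagonal so that alignment emerges quickly. As a sketch of the math behind the two knobs (notation assumed here, not part of the commit), for an attention matrix A over N input tokens and T output frames:

    W_{n,t} = 1 - \exp\left(-\frac{(n/N - t/T)^2}{2\sigma^2}\right), \qquad
    \mathcal{L}_{ga} = \lambda \cdot \frac{1}{NT} \sum_{n,t} A_{n,t} \, W_{n,t}

guided_attn_loss_sigma is sigma (smaller values penalize off-diagonal attention more sharply) and guided_attn_loss_lambda is lambda, the weight of this term in the total loss.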
62 changes: 62 additions & 0 deletions examples/csmsc/tts0/local/preprocess.sh
@@ -0,0 +1,62 @@
#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# get features' stats (mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="speech"

fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone to id; dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
fi
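
A typical invocation, following the run.sh convention of the sibling CSMSC examples (the working directory and the pre-computed baker_alignment_tone MFA output are assumptions, not shown in this diff):

    cd examples/csmsc/tts0
    source path.sh                           # exports MAIN_ROOT and BIN_DIR used above
    ./local/preprocess.sh conf/default.yaml

Note that stage 2 passes no explicit output path to compute_statistics.py, so stage 3's dump/train/speech_stats.npy relies on that tool's default output location.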
20 changes: 20 additions & 0 deletions examples/csmsc/tts0/local/synthesize.sh
@@ -0,0 +1,20 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
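
A typical call, assuming training wrote checkpoints under exp/default and the pretrained pwg_baker_ckpt_0.4 vocoder has been unpacked into the example root (the checkpoint name below is illustrative):

    ./local/synthesize.sh conf/default.yaml exp/default snapshot_iter_XXXXX.pdz

The two FLAGS_* variables keep Paddle from pre-reserving most of the GPU memory up front, so the acoustic model and vocoder can share one card.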
91 changes: 91 additions & 0 deletions examples/csmsc/tts0/local/synthesize_e2e.sh
@@ -0,0 +1,91 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
--voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz \
--voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi

# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static-graph export is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
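
stage and stop_stage are hard-coded at the top of this script rather than read from the command line, so selecting a vocoder other than the default parallel wavegan means editing those two assignments. For example, to run only the HiFiGAN branch (pretrained hifigan_csmsc_ckpt_0.1.1 assumed unpacked in the example root; checkpoint name illustrative):

    # in local/synthesize_e2e.sh, edit:
    #   stage=3
    #   stop_stage=3
    ./local/synthesize_e2e.sh conf/default.yaml exp/default snapshot_iter_XXXXX.pdz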
12 changes: 12 additions & 0 deletions examples/csmsc/tts0/local/train.sh
@@ -0,0 +1,12 @@
#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt
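
Typical usage, following the sibling examples (the output directory name is an assumption):

    ./local/train.sh conf/default.yaml exp/default

With --ngpu=1 training runs on a single GPU; checkpoints accumulate under exp/default/checkpoints/ (where synthesize.sh above expects them), and num_snapshots: 5 in the config bounds how many are kept.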
13 changes: 13 additions & 0 deletions examples/csmsc/tts0/path.sh
@@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=new_tacotron2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
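
path.sh is meant to be sourced rather than executed, so that its exports land in the calling shell where the local/*.sh scripts expect MAIN_ROOT and BIN_DIR. A minimal end-to-end sketch under that assumption (output directory and checkpoint name illustrative):

    cd examples/csmsc/tts0
    source path.sh
    ./local/preprocess.sh conf/default.yaml
    ./local/train.sh conf/default.yaml exp/default
    ./local/synthesize.sh conf/default.yaml exp/default snapshot_iter_XXXXX.pdz

MODEL=new_tacotron2 points BIN_DIR at paddlespeech/t2s/exps/new_tacotron2, which is where the preprocess.py, normalize.py, and train.py entry points referenced by the scripts above live; synthesize.py sits one level up at ${BIN_DIR}/..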