Merge pull request #1314 from yt605155624/add_new_tacotron2
[TTS]Add new tacotron2
zh794390558 authored Jan 19, 2022
2 parents 320bb0f + 9632381 commit 97db74c
Showing 46 changed files with 3,224 additions and 518 deletions.
9 changes: 4 additions & 5 deletions examples/aishell3/tts3/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.


###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
-stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
-stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type

@@ -84,7 +84,6 @@ updater:
use_masking: True # whether to apply masking for padded part in loss calculation



###########################################################
# OPTIMIZER SETTING #
###########################################################
8 changes: 4 additions & 4 deletions examples/aishell3/vc1/conf/default.yaml
@@ -16,8 +16,8 @@ fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.

# Only used for the model using pitch features (e.g. FastSpeech2)
-f0min: 80 # Maximum f0 for pitch extraction.
-f0max: 400 # Minimum f0 for pitch extraction.
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.


###########################################################
@@ -64,14 +64,14 @@ model:
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
-stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
-stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
spk_embed_dim: 256 # speaker embedding dimension
spk_embed_integration_type: concat # speaker embedding integration type

6 changes: 3 additions & 3 deletions examples/aishell3/voc1/conf/default.yaml
@@ -33,7 +33,7 @@ generator_params:
aux_context_window: 2 # Context window size for auxiliary feature.
# If set to 2, previous 2 and future 2 frames will be considered.
dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
-use_weight_norm: true # Whether to use weight norm.
+use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
upsample_scales: [4, 5, 3, 5] # Upsampling scales. prod(upsample_scales) == n_shift

@@ -46,8 +46,8 @@ discriminator_params:
kernel_size: 3 # Kernel size of conv layers.
layers: 10 # Number of conv layers.
conv_channels: 64 # Number of conv channels.
-bias: true # Whether to use bias parameter in conv.
-use_weight_norm: true # Whether to use weight norm.
+bias: True # Whether to use bias parameter in conv.
+use_weight_norm: True # Whether to use weight norm.
# If set to true, it will be applied to all of the conv layers.
nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
nonlinear_activation_params: # Nonlinear function parameters
91 changes: 91 additions & 0 deletions examples/csmsc/tts0/conf/default.yaml
@@ -0,0 +1,91 @@
# This configuration is for Paddle to train Tacotron 2. Compared to the
# original paper, this configuration additionally uses the guided attention
# loss to accelerate the learning of the diagonal attention. It requires
# only a single GPU with 12 GB memory, and training takes about one day
# on a Titan V.

###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################

fs: 24000 # Sampling rate (Hz).
n_fft: 2048 # FFT size (samples).
n_shift: 300 # Hop size (samples). 12.5ms
win_length: 1200 # Window length (samples). 50ms
# If set to null, it will be the same as n_fft.
window: "hann" # Window function.

# Only used for feats_type != raw

fmin: 80 # Minimum frequency of Mel basis.
fmax: 7600 # Maximum frequency of Mel basis.
n_mels: 80 # The number of mel basis.

###########################################################
# DATA SETTING #
###########################################################
batch_size: 64
num_workers: 2

###########################################################
# MODEL SETTING #
###########################################################
model: # keyword arguments for the selected model
embed_dim: 512 # char or phn embedding dimension
elayers: 1 # number of blstm layers in encoder
eunits: 512 # number of blstm units
econv_layers: 3 # number of convolutional layers in encoder
econv_chans: 512 # number of channels in convolutional layer
econv_filts: 5 # filter size of convolutional layer
atype: location # attention function type
adim: 512 # attention dimension
aconv_chans: 32 # number of channels in convolutional layer of attention
aconv_filts: 15 # filter size of convolutional layer of attention
cumulate_att_w: True # whether to cumulate attention weight
dlayers: 2 # number of lstm layers in decoder
dunits: 1024 # number of lstm units in decoder
prenet_layers: 2 # number of layers in prenet
prenet_units: 256 # number of units in prenet
postnet_layers: 5 # number of layers in postnet
postnet_chans: 512 # number of channels in postnet
postnet_filts: 5 # filter size of postnet layer
output_activation: null # activation function for the final output
use_batch_norm: True # whether to use batch normalization in encoder
use_concate: True # whether to concatenate encoder embedding with decoder outputs
use_residual: False # whether to use residual connection in encoder
dropout_rate: 0.5 # dropout rate
zoneout_rate: 0.1 # zoneout rate
reduction_factor: 1 # reduction factor
spk_embed_dim: null # speaker embedding dimension


###########################################################
# UPDATER SETTING #
###########################################################
updater:
use_masking: True # whether to apply masking for padded part in loss calculation
bce_pos_weight: 5.0 # weight of positive sample in binary cross entropy calculation
use_guided_attn_loss: True # whether to use guided attention loss
guided_attn_loss_sigma: 0.4 # sigma of guided attention loss
guided_attn_loss_lambda: 1.0 # strength of guided attention loss


##########################################################
# OPTIMIZER SETTING #
##########################################################
optimizer:
optim: adam # optimizer type
learning_rate: 1.0e-03 # learning rate
epsilon: 1.0e-06 # epsilon
weight_decay: 0.0 # weight decay coefficient

###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 200
num_snapshots: 5

###########################################################
# OTHER SETTING #
###########################################################
seed: 42
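
The updater block above enables the guided attention loss of Tachibana et al. (2017), which penalizes encoder-decoder attention far from the diagonal so that alignment emerges quickly. As a sketch of the math behind the two knobs (notation assumed here, not part of the commit), for an attention matrix A over N input tokens and T output frames:

    W_{n,t} = 1 - \exp\left(-\frac{(n/N - t/T)^2}{2\sigma^2}\right), \qquad
    \mathcal{L}_{ga} = \lambda \cdot \frac{1}{NT} \sum_{n,t} A_{n,t} \, W_{n,t}

guided_attn_loss_sigma is sigma (smaller values penalize off-diagonal attention more sharply) and guided_attn_loss_lambda is lambda, the weight of this term in the total loss.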
62 changes: 62 additions & 0 deletions examples/csmsc/tts0/local/preprocess.sh
@@ -0,0 +1,62 @@
#!/bin/bash

stage=0
stop_stage=100

config_path=$1

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=${config_path}
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True
fi

if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# get features' stats (mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="speech"

fi

if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone to id; dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speech-stats=dump/train/speech_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
fi
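
A typical invocation, following the run.sh convention of the sibling CSMSC examples (the working directory and the pre-computed baker_alignment_tone MFA output are assumptions, not shown in this diff):

    cd examples/csmsc/tts0
    source path.sh                           # exports MAIN_ROOT and BIN_DIR used above
    ./local/preprocess.sh conf/default.yaml

Note that stage 2 passes no explicit output path to compute_statistics.py, so stage 3's dump/train/speech_stats.npy relies on that tool's default output location.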
20 changes: 20 additions & 0 deletions examples/csmsc/tts0/local/synthesize.sh
@@ -0,0 +1,20 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
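
A typical call, assuming training wrote checkpoints under exp/default and the pretrained pwg_baker_ckpt_0.4 vocoder has been unpacked into the example root (the checkpoint name below is illustrative):

    ./local/synthesize.sh conf/default.yaml exp/default snapshot_iter_XXXXX.pdz

The two FLAGS_* variables keep Paddle from pre-reserving most of the GPU memory up front, so the acoustic model and vocoder can share one card.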
91 changes: 91 additions & 0 deletions examples/csmsc/tts0/local/synthesize_e2e.sh
@@ -0,0 +1,91 @@
#!/bin/bash

config_path=$1
train_output_path=$2
ckpt_name=$3

stage=0
stop_stage=0

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--voc_config=mb_melgan_baker_finetune_ckpt_0.5/finetune.yaml \
--voc_ckpt=mb_melgan_baker_finetune_ckpt_0.5/snapshot_iter_2000000.pdz \
--voc_stat=mb_melgan_baker_finetune_ckpt_0.5/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi

# the pretrained models haven't been released yet
# style melgan
# style melgan's dygraph-to-static-graph export is not ready yet
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=style_melgan_csmsc \
--voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
fi

# hifigan
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "in hifigan syn_e2e"
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=tacotron2_csmsc \
--am_config=${config_path} \
--am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
fi
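
stage and stop_stage are hard-coded at the top of this script rather than read from the command line, so selecting a vocoder other than the default parallel wavegan means editing those two assignments. For example, to run only the HiFiGAN branch (pretrained hifigan_csmsc_ckpt_0.1.1 assumed unpacked in the example root; checkpoint name illustrative):

    # in local/synthesize_e2e.sh, edit:
    #   stage=3
    #   stop_stage=3
    ./local/synthesize_e2e.sh conf/default.yaml exp/default snapshot_iter_XXXXX.pdz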
12 changes: 12 additions & 0 deletions examples/csmsc/tts0/local/train.sh
@@ -0,0 +1,12 @@
#!/bin/bash

config_path=$1
train_output_path=$2

python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt
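
Typical usage, following the sibling examples (the output directory name is an assumption):

    ./local/train.sh conf/default.yaml exp/default

With --ngpu=1 training runs on a single GPU; checkpoints accumulate under exp/default/checkpoints/ (where synthesize.sh above expects them), and num_snapshots: 5 in the config bounds how many are kept.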
13 changes: 13 additions & 0 deletions examples/csmsc/tts0/path.sh
@@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`

export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C

export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}

MODEL=new_tacotron2
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
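
path.sh is meant to be sourced rather than executed, so that its exports land in the calling shell where the local/*.sh scripts expect MAIN_ROOT and BIN_DIR. A minimal end-to-end sketch under that assumption (output directory and checkpoint name illustrative):

    cd examples/csmsc/tts0
    source path.sh
    ./local/preprocess.sh conf/default.yaml
    ./local/train.sh conf/default.yaml exp/default
    ./local/synthesize.sh conf/default.yaml exp/default snapshot_iter_XXXXX.pdz

MODEL=new_tacotron2 points BIN_DIR at paddlespeech/t2s/exps/new_tacotron2, which is where the preprocess.py, normalize.py, and train.py entry points referenced by the scripts above live; synthesize.py sits one level up at ${BIN_DIR}/..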