fix comment

PaddlePaddle · Mar 9, 2023 · 9acc852 · 9acc852
1 parent 3df69e7
commit 9acc852
Show file tree

Hide file tree

Showing 19 changed files with 302 additions and 132 deletions.
diff --git a/examples/opencpop/svs1/conf/default.yaml b/examples/opencpop/svs1/conf/default.yaml
@@ -34,6 +34,7 @@ model:
     # music score related
     note_num: 300                                     # number of note
     is_slur_num: 2                                    # number of slur
+    stretch: True                                     # whether to stretch before diffusion
 
     # fastspeech2 module
     fastspeech2_params:
@@ -142,15 +143,14 @@ ds_grad_norm: 1
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
+only_train_diffusion: True                 # Whether to freeze fastspeech2 parameters when training diffusion
 ds_train_start_steps: 160000              # Number of steps to start to train diffusion module.
 train_max_steps: 320000                   # Number of training steps.
 save_interval_steps: 2000                 # Interval steps to save checkpoint.
 eval_interval_steps: 2000                 # Interval steps to evaluate the network.
-num_snapshots: 5                        # Number of saved models
-
+num_snapshots: 5
 
 ###########################################################
 #                       OTHER SETTING                     #
 ###########################################################
 seed: 10086
-find_unused_parameters: True
diff --git a/examples/opencpop/svs1/local/preprocess.sh b/examples/opencpop/svs1/local/preprocess.sh
@@ -64,3 +64,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --phones-dict=dump/phone_id_map.txt \
         --speaker-dict=dump/speaker_id_map.txt
 fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # Get feature(mel) extremum for diffusion stretch
+    echo "Get feature(mel) extremum  ..."
+    python3 ${BIN_DIR}/computer_extremum.py \
+        --metadata=dump/train/norm/metadata.jsonl \
+        --speech-stretchs=dump/train/speech_stretchs.npy
+fi
diff --git a/examples/opencpop/svs1/local/synthesize.sh b/examples/opencpop/svs1/local/synthesize.sh
@@ -3,8 +3,6 @@
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
-#iter=$3
-#ckpt_name=snapshot_iter_${iter}.pdz
 stage=0
 stop_stage=0
 
@@ -21,8 +19,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_config=pwgan_opencpop/default.yaml \
         --voc_ckpt=pwgan_opencpop/snapshot_iter_100000.pdz \
         --voc_stat=pwgan_opencpop/feats_stats.npy \
-        --test_metadata=test.jsonl \
-        --output_dir=${train_output_path}/test_${iter} \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
         --phones_dict=dump/phone_id_map.txt
 fi
 
diff --git a/examples/opencpop/svs1/local/train.sh b/examples/opencpop/svs1/local/train.sh
@@ -9,4 +9,5 @@ python3 ${BIN_DIR}/train.py \
     --config=${config_path} \
     --output-dir=${train_output_path} \
     --ngpu=1 \
-    --phones-dict=dump/phone_id_map.txt
+    --phones-dict=dump/phone_id_map.txt \
+    --speech-stretchs=dump/train/speech_stretchs.npy
diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py
@@ -12,18 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Modified from espnet(https://github.com/espnet/espnet)
+from typing import List
+from typing import Optional
+from typing import Union
+
 import librosa
 import numpy as np
 import pyworld
 from scipy.interpolate import interp1d
-from typing import List
-
-from typing import Optional
-from typing import Union
 from typing_extensions import Literal
 
 
-
 class LogMelFBank():
     def __init__(self,
                  sr: int=24000,
@@ -80,7 +79,7 @@ def _stft(self, wav: np.ndarray):
 
     def _spectrogram(self, wav: np.ndarray):
         D = self._stft(wav)
-        return np.abs(D) ** self.power
+        return np.abs(D)**self.power
 
     def _mel_spectrogram(self, wav: np.ndarray):
         S = self._spectrogram(wav)
@@ -139,7 +138,7 @@ def _calculate_f0(self,
                       input: np.ndarray,
                       use_continuous_f0: bool=True,
                       use_log_f0: bool=True) -> np.ndarray:
-        input = input.astype(float)
+        input = input.astype(np.float32)
         frame_period = 1000 * self.hop_length / self.sr
         f0, timeaxis = pyworld.dio(
             input,

diff --git a/paddlespeech/t2s/exps/diffsinger/__init__.py b/paddlespeech/t2s/exps/diffsinger/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlespeech/t2s/exps/diffsinger/computer_extremum.py b/paddlespeech/t2s/exps/diffsinger/computer_extremum.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import logging
+
+import jsonlines
+import numpy as np
+from tqdm import tqdm
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+
+
+def find_min_max_spec(spec, min_spec, max_spec):
+    """Fold the per-bin extrema of one spectrogram into the running bounds.
+
+    Args:
+        spec: 2-D array laid out as [T, n_mel] (frames x mel bins).
+        min_spec: 1-D running minimum per bin; updated in place.
+        max_spec: 1-D running maximum per bin; updated in place.
+
+    Returns:
+        The same ``(min_spec, max_spec)`` arrays that were passed in.
+
+    Note:
+        NaN values in ``spec`` or in the running bounds propagate to the
+        result (``np.minimum`` / ``np.maximum`` semantics).
+    """
+    n_bins = spec.shape[1]
+    # Views, so writing through ``out=`` mutates the caller's arrays; only the
+    # first ``n_bins`` entries are touched, like the original per-bin loop.
+    lower = min_spec[:n_bins]
+    upper = max_spec[:n_bins]
+    # Reduce over the time axis once instead of looping over bins in Python.
+    np.minimum(lower, np.min(spec, axis=0), out=lower)
+    np.maximum(upper, np.max(spec, axis=0), out=upper)
+
+    return min_spec, max_spec
+
+
+def main():
+    """Compute per-bin min/max of the dumped mel features and save them.
+
+    Reads the jsonlines file given by ``--metadata``, loads every ``speech``
+    entry with ``np.load``, tracks the minimum and maximum of each feature bin
+    over all items, and writes the stacked ``[min_spec, max_spec]`` array
+    (shape [2, n_mel], float32) to the path given by ``--speech-stretchs``.
+
+    Raises:
+        ValueError: if the metadata file contains no items.
+    """
+    parser = argparse.ArgumentParser(
+        description="Compute the per-bin minimum and maximum of dumped mel features (used for diffusion stretch)."
+    )
+    parser.add_argument(
+        "--metadata",
+        type=str,
+        required=True,
+        help="jsonlines metadata file whose 'speech' entries point to the "
+        "feature (.npy) files to scan.")
+
+    parser.add_argument(
+        "--speech-stretchs",
+        type=str,
+        required=True,
+        help="output min/max spec file; compute it on train data only.")
+
+    args = parser.parse_args()
+
+    # get dataset
+    with jsonlines.open(args.metadata, 'r') as reader:
+        metadata = list(reader)
+    dataset = DataTable(
+        metadata, converters={
+            "speech": np.load,
+        })
+    logging.info(f"The number of files = {len(dataset)}.")
+
+    # The bounds are created lazily from the first spectrogram so the number
+    # of bins is not hard-coded, and +/-inf sentinels never clip real data.
+    min_spec = None
+    max_spec = None
+
+    for item in tqdm(dataset):
+        spec = item['speech']
+        if min_spec is None:
+            n_mel = spec.shape[1]
+            min_spec = np.full(n_mel, np.inf, dtype=np.float32)
+            max_spec = np.full(n_mel, -np.inf, dtype=np.float32)
+        min_spec, max_spec = find_min_max_spec(spec, min_spec, max_spec)
+
+    if min_spec is None:
+        # Saving sentinel values would silently corrupt the stretch range.
+        raise ValueError(
+            f"No features found in {args.metadata}; cannot compute extrema.")
+
+    print(min_spec)
+    print(max_spec)
+
+    min_max_spec = np.stack([min_spec, max_spec], axis=0)
+    np.save(
+        str(args.speech_stretchs),
+        min_max_spec.astype(np.float32),
+        allow_pickle=False)
+
+
+# Run the extremum computation only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/paddlespeech/t2s/exps/diffsinger/normalize.py b/paddlespeech/t2s/exps/diffsinger/normalize.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlespeech/t2s/exps/diffsinger/preprocess.py b/paddlespeech/t2s/exps/diffsinger/preprocess.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/paddlespeech/t2s/exps/diffsinger/train.py b/paddlespeech/t2s/exps/diffsinger/train.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -127,9 +127,21 @@ def train_sp(args, config):
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
 
+    with open(args.speech_stretchs, "r") as f:
+        spec_min = np.load(args.speech_stretchs)[0]
+        spec_max = np.load(args.speech_stretchs)[1]
+        spec_min = paddle.to_tensor(spec_min)
+        spec_max = paddle.to_tensor(spec_max)
+    print("min and max spec done!")
+
     odim = config.n_mels
     config["model"]["fastspeech2_params"]["spk_num"] = spk_num
-    model = DiffSinger(idim=vocab_size, odim=odim, **config["model"])
+    model = DiffSinger(
+        idim=vocab_size,
+        odim=odim,
+        **config["model"],
+        spec_min=spec_min,
+        spec_max=spec_max)
     model_fs2 = model.fs2
     model_ds = model.diffusion
     if world_size > 1:
@@ -143,13 +155,6 @@ def train_sp(args, config):
     print("criterions done!")
 
     optimizer_fs2 = build_optimizers(model_fs2, **config["fs2_optimizer"])
-    # gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
-    # optimizer_ds = AdamW(
-    #     learning_rate=config["ds_scheduler_params"]["learning_rate"],
-    #     grad_clip=gradient_clip_ds,
-    #     parameters=model_ds.parameters(),
-    #     **config["ds_optimizer_params"])
-
     lr_schedule_ds = StepDecay(**config["ds_scheduler_params"])
     gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
     optimizer_ds = AdamW(
@@ -178,7 +183,8 @@ def train_sp(args, config):
         },
         dataloader=train_dataloader,
         ds_train_start_steps=config.ds_train_start_steps,
-        output_dir=output_dir)
+        output_dir=output_dir,
+        only_train_diffusion=config["only_train_diffusion"])
 
     evaluator = DiffSingerEvaluator(
         model=model,
@@ -222,6 +228,10 @@ def main():
         type=str,
         default=None,
         help="speaker id map file for multiple speaker model.")
+    parser.add_argument(
+        "--speech-stretchs",
+        type=str,
+        help="The min and max values of the mel spectrum.")
 
     args = parser.parse_args()
 

diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
@@ -149,8 +149,6 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
             print("single speaker fastspeech2!")
     elif am_name == 'diffsinger':
         fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
-    elif am_name == 'fastspeech2midi':
-        fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
     elif am_name == 'speedyspeech':
         fields = ["utt_id", "phones", "tones"]
     elif am_name == 'tacotron2':

diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
@@ -112,44 +112,29 @@ def evaluate(args):
                     note = paddle.to_tensor(datum["note"])
                     note_dur = paddle.to_tensor(datum["note_dur"])
                     is_slur = paddle.to_tensor(datum["is_slur"])
+                    # get_mel_fs2=False: use the mel from the diffusion module; get_mel_fs2=True: use the mel from fastspeech2.
                     get_mel_fs2 = False
                     # mel: [T, mel_bin]
-                    mel1 = am_inference(
-                        phone_ids,
-                        note=note,
-                        note_dur=note_dur,
-                        is_slur=is_slur,
-                        get_mel_fs2=True)
-                    mel2 = am_inference(
+                    mel = am_inference(
                         phone_ids,
                         note=note,
                         note_dur=note_dur,
                         is_slur=is_slur,
-                        get_mel_fs2=False)
-                wav1 = voc_inference(mel1)
-                wav2 = voc_inference(mel2)
+                        get_mel_fs2=get_mel_fs2)
+                # vocoder
+                wav = voc_inference(mel)
 
-            wav1 = wav1.numpy()
-            wav2 = wav2.numpy()
-            N += wav1.size
-            N += wav2.size
+            wav = wav.numpy()
+            N += wav.size
             T += t.elapse
-            speed = 2 * wav1.size / t.elapse
+            speed = wav.size / t.elapse
             rtf = am_config.fs / speed
         print(
-            f"{utt_id}, mel: {mel1.shape}, wave: {wav1.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+            f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
         )
         sf.write(
-            str(output_dir / (utt_id + "_fs2.wav")),
-            wav1,
-            samplerate=am_config.fs)
-        sf.write(
-            str(output_dir / (utt_id + "_diffusion.wav")),
-            wav2,
-            samplerate=am_config.fs)
-
+            str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
         print(f"{utt_id} done!")
-        # break
     print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
 
 

diff --git a/paddlespeech/t2s/models/diffsinger/__init__.py b/paddlespeech/t2s/models/diffsinger/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.