diff --git a/examples/opencpop/svs1/conf/default.yaml b/examples/opencpop/svs1/conf/default.yaml
index 57ffde9a261..32ad115a783 100644
--- a/examples/opencpop/svs1/conf/default.yaml
+++ b/examples/opencpop/svs1/conf/default.yaml
@@ -34,6 +34,7 @@ model:
     # music score related
     note_num: 300 # number of note
     is_slur_num: 2 # number of slur
+    stretch: True # whether to stretch before diffusion
 
     # fastspeech2 module
     fastspeech2_params:
@@ -142,15 +143,14 @@ ds_grad_norm: 1
 ###########################################################
 #                     INTERVAL SETTING                    #
 ###########################################################
+only_train_diffusion: True # Whether to freeze fastspeech2 parameters when training diffusion
 ds_train_start_steps: 160000 # Number of steps to start to train diffusion module.
 train_max_steps: 320000 # Number of training steps.
 save_interval_steps: 2000 # Interval steps to save checkpoint.
 eval_interval_steps: 2000 # Interval steps to evaluate the network.
-num_snapshots: 5 # Number of saved models
-
+num_snapshots: 5
 ###########################################################
 #                       OTHER SETTING                     #
 ###########################################################
 seed: 10086
-find_unused_parameters: True
diff --git a/examples/opencpop/svs1/local/preprocess.sh b/examples/opencpop/svs1/local/preprocess.sh
index 1c98ca84def..a7f1e4d4893 100755
--- a/examples/opencpop/svs1/local/preprocess.sh
+++ b/examples/opencpop/svs1/local/preprocess.sh
@@ -64,3 +64,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
         --phones-dict=dump/phone_id_map.txt \
         --speaker-dict=dump/speaker_id_map.txt
 fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # Get the feature (mel) extrema used by the diffusion stretch
+    echo "Get feature (mel) extrema ..."
+    python3 ${BIN_DIR}/computer_extremum.py \
+        --metadata=dump/train/norm/metadata.jsonl \
+        --speech-stretchs=dump/train/speech_stretchs.npy
+fi
diff --git a/examples/opencpop/svs1/local/synthesize.sh b/examples/opencpop/svs1/local/synthesize.sh
index dae0c632391..83d17591ab0 100755
--- a/examples/opencpop/svs1/local/synthesize.sh
+++ b/examples/opencpop/svs1/local/synthesize.sh
@@ -3,8 +3,6 @@
 config_path=$1
 train_output_path=$2
 ckpt_name=$3
-#iter=$3
-#ckpt_name=snapshot_iter_${iter}.pdz
 
 stage=0
 stop_stage=0
@@ -21,8 +19,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
         --voc_config=pwgan_opencpop/default.yaml \
         --voc_ckpt=pwgan_opencpop/snapshot_iter_100000.pdz \
         --voc_stat=pwgan_opencpop/feats_stats.npy \
-        --test_metadata=test.jsonl \
-        --output_dir=${train_output_path}/test_${iter} \
+        --test_metadata=dump/test/norm/metadata.jsonl \
+        --output_dir=${train_output_path}/test \
         --phones_dict=dump/phone_id_map.txt
 fi
diff --git a/examples/opencpop/svs1/local/train.sh b/examples/opencpop/svs1/local/train.sh
index d1302f99ff1..5be624fc455 100755
--- a/examples/opencpop/svs1/local/train.sh
+++ b/examples/opencpop/svs1/local/train.sh
@@ -9,4 +9,5 @@ python3 ${BIN_DIR}/train.py \
     --config=${config_path} \
     --output-dir=${train_output_path} \
     --ngpu=1 \
-    --phones-dict=dump/phone_id_map.txt
+    --phones-dict=dump/phone_id_map.txt \
+    --speech-stretchs=dump/train/speech_stretchs.npy
diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py
index 164b2fb4a6a..6365dd7fc0b 100644
--- a/paddlespeech/t2s/datasets/get_feats.py
+++ b/paddlespeech/t2s/datasets/get_feats.py
@@ -12,18 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Modified from espnet(https://github.com/espnet/espnet)
+from typing import List
+from typing import Optional
+from typing import Union
+
 import librosa
 import numpy as np
 import pyworld
 from scipy.interpolate import interp1d
-from typing import List
-
-from typing import Optional
-from typing import Union
 from typing_extensions import Literal
 
-
 class LogMelFBank():
     def __init__(self,
                  sr: int=24000,
@@ -80,7 +79,7 @@ def _stft(self, wav: np.ndarray):
 
     def _spectrogram(self, wav: np.ndarray):
         D = self._stft(wav)
-        return np.abs(D) ** self.power
+        return np.abs(D)**self.power
 
     def _mel_spectrogram(self, wav: np.ndarray):
         S = self._spectrogram(wav)
@@ -139,7 +138,7 @@ def _calculate_f0(self,
                       input: np.ndarray,
                       use_continuous_f0: bool=True,
                       use_log_f0: bool=True) -> np.ndarray:
-        input = input.astype(float)
+        input = input.astype(np.float64)  # pyworld.dio requires float64 ("double") input
         frame_period = 1000 * self.hop_length / self.sr
         f0, timeaxis = pyworld.dio(
             input,
diff --git a/paddlespeech/t2s/exps/diffsinger/__init__.py b/paddlespeech/t2s/exps/diffsinger/__init__.py
index abf198b97e6..595add0aed9 100644
--- a/paddlespeech/t2s/exps/diffsinger/__init__.py
+++ b/paddlespeech/t2s/exps/diffsinger/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddlespeech/t2s/exps/diffsinger/computer_extremum.py b/paddlespeech/t2s/exps/diffsinger/computer_extremum.py
new file mode 100644
index 00000000000..4cb4500f49a
--- /dev/null
+++ b/paddlespeech/t2s/exps/diffsinger/computer_extremum.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import logging
+
+import jsonlines
+import numpy as np
+from tqdm import tqdm
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+
+
+def find_min_max_spec(spec, min_spec, max_spec):
+    # spec: [T, 80]; update the running per-bin extrema
+    for i in range(spec.shape[1]):
+        min_value = np.min(spec[:, i])
+        max_value = np.max(spec[:, i])
+        min_spec[i] = min(min_value, min_spec[i])
+        max_spec[i] = max(max_value, max_spec[i])
+
+    return min_spec, max_spec
+
+
+def main():
+    """Run preprocessing process."""
+    parser = argparse.ArgumentParser(
+        description="Compute the per-bin extrema (min/max) of dumped mel features."
+    )
+    parser.add_argument(
+        "--metadata",
+        type=str,
+        required=True,
+        help="metadata.jsonl of the dumped (normalized) features; "
+        "compute on the train split only.")
+
+    parser.add_argument(
+        "--speech-stretchs",
+        type=str,
+        required=True,
+        help="min max spec file. only computed on the train data.")
only computer on train data") + + args = parser.parse_args() + + # get dataset + with jsonlines.open(args.metadata, 'r') as reader: + metadata = list(reader) + dataset = DataTable( + metadata, converters={ + "speech": np.load, + }) + logging.info(f"The number of files = {len(dataset)}.") + + n_mel = 80 + min_spec = 100 * np.ones(shape=(n_mel), dtype=np.float32) + max_spec = -100 * np.ones(shape=(n_mel), dtype=np.float32) + + for item in tqdm(dataset): + spec = item['speech'] + min_spec, max_spec = find_min_max_spec(spec, min_spec, max_spec) + + print(min_spec) + print(max_spec) + + min_max_spec = np.stack([min_spec, max_spec], axis=0) + np.save( + str(args.speech_stretchs), + min_max_spec.astype(np.float32), + allow_pickle=False) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/diffsinger/normalize.py b/paddlespeech/t2s/exps/diffsinger/normalize.py index dec6127e1f9..d3e6116210b 100644 --- a/paddlespeech/t2s/exps/diffsinger/normalize.py +++ b/paddlespeech/t2s/exps/diffsinger/normalize.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/t2s/exps/diffsinger/preprocess.py b/paddlespeech/t2s/exps/diffsinger/preprocess.py index d5209c2c57b..be526eff117 100644 --- a/paddlespeech/t2s/exps/diffsinger/preprocess.py +++ b/paddlespeech/t2s/exps/diffsinger/preprocess.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/t2s/exps/diffsinger/train.py b/paddlespeech/t2s/exps/diffsinger/train.py index 5e834b3d38f..3f062eefcb8 100644 --- a/paddlespeech/t2s/exps/diffsinger/train.py +++ b/paddlespeech/t2s/exps/diffsinger/train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -127,9 +127,19 @@ def train_sp(args, config):
         vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
 
+    speech_stretchs = np.load(args.speech_stretchs)
+    spec_min = paddle.to_tensor(speech_stretchs[0])
+    spec_max = paddle.to_tensor(speech_stretchs[1])
+    print("min and max spec done!")
+
     odim = config.n_mels
     config["model"]["fastspeech2_params"]["spk_num"] = spk_num
-    model = DiffSinger(idim=vocab_size, odim=odim, **config["model"])
+    model = DiffSinger(
+        idim=vocab_size,
+        odim=odim,
+        **config["model"],
+        spec_min=spec_min,
+        spec_max=spec_max)
     model_fs2 = model.fs2
     model_ds = model.diffusion
     if world_size > 1:
@@ -143,13 +153,6 @@ def train_sp(args, config):
     print("criterions done!")
 
     optimizer_fs2 = build_optimizers(model_fs2, **config["fs2_optimizer"])
-    # gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
-    # optimizer_ds = AdamW(
-    #     learning_rate=config["ds_scheduler_params"]["learning_rate"],
-    #     grad_clip=gradient_clip_ds,
-    #     parameters=model_ds.parameters(),
-    #     **config["ds_optimizer_params"])
-
     lr_schedule_ds = StepDecay(**config["ds_scheduler_params"])
     gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
     optimizer_ds = AdamW(
@@ -178,7 +181,8 @@ def train_sp(args, config):
         },
         dataloader=train_dataloader,
         ds_train_start_steps=config.ds_train_start_steps,
-        output_dir=output_dir)
+        output_dir=output_dir,
+        only_train_diffusion=config["only_train_diffusion"])
 
     evaluator = DiffSingerEvaluator(
         model=model,
@@ -222,6 +226,10 @@ def main():
         type=str,
         default=None,
         help="speaker id map file for multiple speaker model.")
+    parser.add_argument(
+        "--speech-stretchs",
+        type=str,
+        help="File storing the min and max values of the mel spectrum.")
 
     args = parser.parse_args()
 
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index b1a8add0ada..dd580c9b3e9 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -149,8 +149,6 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
             print("single speaker fastspeech2!")
     elif am_name == 'diffsinger':
         fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
-    elif am_name == 'fastspeech2midi':
-        fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
     elif am_name == 'speedyspeech':
         fields = ["utt_id", "phones", "tones"]
     elif am_name == 'tacotron2':
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index 64ac9fa18b0..410ae17fbad 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -112,44 +112,29 @@ def evaluate(args):
                 note = paddle.to_tensor(datum["note"])
                 note_dur = paddle.to_tensor(datum["note_dur"])
                 is_slur = paddle.to_tensor(datum["is_slur"])
+                # get_mel_fs2=False: the mel comes from the diffusion module; get_mel_fs2=True: from fastspeech2.
+                get_mel_fs2 = False
                 # mel: [T, mel_bin]
-                mel1 = am_inference(
-                    phone_ids,
-                    note=note,
-                    note_dur=note_dur,
-                    is_slur=is_slur,
-                    get_mel_fs2=True)
-                mel2 = am_inference(
+                mel = am_inference(
                     phone_ids,
                     note=note,
                     note_dur=note_dur,
                     is_slur=is_slur,
-                    get_mel_fs2=False)
-                wav1 = voc_inference(mel1)
-                wav2 = voc_inference(mel2)
+                    get_mel_fs2=get_mel_fs2)
+                # vocoder
+                wav = voc_inference(mel)
 
-                wav1 = wav1.numpy()
-                wav2 = wav2.numpy()
-                N += wav1.size
-                N += wav2.size
+                wav = wav.numpy()
+                N += wav.size
                 T += t.elapse
-                speed = 2 * wav1.size / t.elapse
+                speed = wav.size / t.elapse
                 rtf = am_config.fs / speed
             print(
-                f"{utt_id}, mel: {mel1.shape}, wave: {wav1.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+                f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
             )
             sf.write(
-                str(output_dir / (utt_id + "_fs2.wav")),
-                wav1,
-                samplerate=am_config.fs)
-            sf.write(
-                str(output_dir / (utt_id + "_diffusion.wav")),
-                wav2,
-                samplerate=am_config.fs)
-
+                str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
             print(f"{utt_id} done!")
-        # break
 
     print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
diff --git a/paddlespeech/t2s/models/diffsinger/__init__.py b/paddlespeech/t2s/models/diffsinger/__init__.py
index d07a45711be..785293ee23f 100644
--- a/paddlespeech/t2s/models/diffsinger/__init__.py
+++ b/paddlespeech/t2s/models/diffsinger/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger.py b/paddlespeech/t2s/models/diffsinger/diffsinger.py
index b86d835bce3..50f7199183a 100644
--- a/paddlespeech/t2s/models/diffsinger/diffsinger.py
+++ b/paddlespeech/t2s/models/diffsinger/diffsinger.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -136,7 +136,9 @@ def __init__(
                 "beta_schedule": "squaredcos_cap_v2",
                 "num_max_timesteps": 60
             },
-            stretch: bool=True, ):
+            stretch: bool=True,
+            spec_min: paddle.Tensor=None,
+            spec_max: paddle.Tensor=None, ):
         """Initialize DiffSinger module.
 
         Args:
@@ -149,6 +151,7 @@ def __init__(
             fastspeech2_params (Dict[str, Any]): Parameter dict for fastspeech2 module.
             denoiser_params (Dict[str, Any]): Parameter dict for dinoiser module.
             diffusion_params (Dict[str, Any]): Parameter dict for diffusion module.
+            stretch (bool): Whether to stretch before diffusion. Defaults to True.
""" assert check_argument_types() super().__init__() @@ -159,33 +162,6 @@ def __init__( note_num=note_num, is_slur_num=is_slur_num) denoiser = DiffNet(**denoiser_params) - spec_min = paddle.to_tensor( - np.array([ - -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, - -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, - -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, - -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, - -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, - -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, - -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, - -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0, -6.0 - ])) - spec_max = paddle.to_tensor( - np.array([ - -0.79453, -0.81116, -0.61631, -0.30679, -0.13863, -0.050652, - -0.11563, -0.10679, -0.091068, -0.062174, -0.075302, -0.072217, - -0.063815, -0.073299, 0.007361, -0.072508, -0.050234, -0.16534, - -0.26928, -0.20782, -0.20823, -0.11702, -0.070128, -0.065868, - -0.012675, 0.0015121, -0.089902, -0.21392, -0.23789, -0.28922, - -0.30405, -0.23029, -0.22088, -0.21542, -0.29367, -0.30137, - -0.38281, -0.4359, -0.28681, -0.46855, -0.57485, -0.47022, - -0.54266, -0.44848, -0.6412, -0.687, -0.6486, -0.76436, - -0.49971, -0.71068, -0.69724, -0.61487, -0.55843, -0.69773, - -0.57502, -0.70919, -0.82431, -0.84213, -0.90431, -0.8284, - -0.77945, -0.82758, -0.87699, -1.0532, -1.0766, -1.1198, - -1.0185, -0.98983, -1.0001, -1.0756, -1.0024, -1.0304, -1.0579, - -1.0188, -1.05, -1.0842, -1.0923, -1.1223, -1.2381, -1.6467 - ])) self.diffusion = GaussianDiffusion( denoiser, **diffusion_params, diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py index 1ec1be5be97..018b781d1ed 100644 --- a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py +++ b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -139,9 +139,8 @@ def update_core(self, batch): # Then only train diffusion module, freeze fastspeech2 parameters. 
         if self.state.iteration > self.ds_train_start_steps:
-            if self.only_train_diffusion:
-                for param in self.model.fs2.parameters():
-                    param.trainable = False
+            for param in self.model.fs2.parameters():
+                param.trainable = not self.only_train_diffusion
 
             noise_pred, noise_target, mel_masks = self.model(
                 text=batch["text"],
@@ -213,7 +212,59 @@ def evaluate_core(self, batch):
         if spk_emb is not None:
             spk_id = None
 
-        # Here show diffsinger eval
+        # Here show fastspeech2 eval
+        before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
+            text=batch["text"],
+            note=batch["note"],
+            note_dur=batch["note_dur"],
+            is_slur=batch["is_slur"],
+            text_lengths=batch["text_lengths"],
+            speech=batch["speech"],
+            speech_lengths=batch["speech_lengths"],
+            durations=batch["durations"],
+            pitch=batch["pitch"],
+            energy=batch["energy"],
+            spk_id=spk_id,
+            spk_emb=spk_emb,
+            only_train_fs2=True, )
+
+        l1_loss_fs2, ssim_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2(
+            after_outs=after_outs,
+            before_outs=before_outs,
+            d_outs=d_outs,
+            p_outs=p_outs,
+            e_outs=e_outs,
+            ys=ys,
+            ds=batch["durations"],
+            ps=batch["pitch"],
+            es=batch["energy"],
+            ilens=batch["text_lengths"],
+            olens=olens,
+            spk_logits=spk_logits,
+            spk_ids=spk_id, )
+
+        loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss + speaker_loss
+
+        report("eval/loss_fs2", float(loss_fs2))
+        report("eval/l1_loss_fs2", float(l1_loss_fs2))
+        report("eval/ssim_loss_fs2", float(ssim_loss_fs2))
+        report("eval/duration_loss", float(duration_loss))
+        report("eval/pitch_loss", float(pitch_loss))
+        report("eval/energy_loss", float(energy_loss))
+
+        losses_dict["l1_loss_fs2"] = float(l1_loss_fs2)
+        losses_dict["ssim_loss_fs2"] = float(ssim_loss_fs2)
+        losses_dict["duration_loss"] = float(duration_loss)
+        losses_dict["pitch_loss"] = float(pitch_loss)
+        losses_dict["energy_loss"] = float(energy_loss)
+
+        if speaker_loss != 0.:
+            report("eval/speaker_loss", float(speaker_loss))
+            losses_dict["speaker_loss"] = float(speaker_loss)
+
+        losses_dict["loss_fs2"] = float(loss_fs2)
+
+        # Here show diffusion eval
         noise_pred, noise_target, mel_masks = self.model(
             text=batch["text"],
             note=batch["note"],
@@ -236,6 +287,7 @@ def evaluate_core(self, batch):
             noise_pred=noise_pred,
             noise_target=noise_target,
             mel_masks=mel_masks, )
+
         loss_ds = l1_loss_ds
 
         report("eval/loss_ds", float(loss_ds))
diff --git a/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
index 53a72ebe674..7846779dbca 100644
--- a/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
+++ b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
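The updater changes above implement a two-stage schedule: FastSpeech2 trains alone until `ds_train_start_steps`, after which the diffusion denoiser trains and, with `only_train_diffusion: True`, the FastSpeech2 weights are frozen. A condensed sketch of that toggle (a hypothetical helper for illustration, not the updater's actual API):

```python
def set_stage(model, iteration: int, ds_train_start_steps: int,
              only_train_diffusion: bool) -> None:
    # Stage 1 (iteration <= ds_train_start_steps): only FastSpeech2 updates.
    # Stage 2: the diffusion module updates; FastSpeech2 is optionally frozen.
    in_diffusion_stage = iteration > ds_train_start_steps
    for param in model.fs2.parameters():
        param.trainable = not (in_diffusion_stage and only_train_diffusion)
```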
@@ -175,18 +175,18 @@ def _forward(
         before_outs = after_outs = d_outs = p_outs = e_outs = spk_logits = None
 
         # forward encoder
-        x_masks = self._source_mask(ilens)
+        masks = self._source_mask(ilens)
         note_emb = self.note_embedding_table(note)
         note_dur_emb = self.note_dur_layer(paddle.unsqueeze(note_dur, axis=-1))
         is_slur_emb = self.is_slur_embedding_table(is_slur)
 
         # (B, Tmax, adim)
         hs, _ = self.encoder(
-            xs,
-            x_masks,
-            note_emb,
-            note_dur_emb,
-            is_slur_emb, )
+            xs=xs,
+            masks=masks,
+            note_emb=note_emb,
+            note_dur_emb=note_dur_emb,
+            is_slur_emb=is_slur_emb, )
 
         if self.spk_num and self.enable_speaker_classifier and not is_inference:
             hs_for_spk_cls = self.grad_reverse(hs)
diff --git a/paddlespeech/t2s/modules/diffnet.py b/paddlespeech/t2s/modules/diffnet.py
index 6a87c55374e..25339daea8b 100644
--- a/paddlespeech/t2s/modules/diffnet.py
+++ b/paddlespeech/t2s/modules/diffnet.py
@@ -52,10 +52,26 @@ def Linear(*args, **kwargs):
 
 class ResidualBlock(nn.Layer):
     """ResidualBlock
+
+    Args:
+        encoder_hidden (int, optional):
+            Input feature size of the 1D convolution, by default 256
+        residual_channels (int, optional):
+            Feature size of the residual output (and also the input), by default 256
+        gate_channels (int, optional):
+            Output feature size of the 1D convolution, by default 512
+        kernel_size (int, optional):
+            Kernel size of the 1D convolution, by default 3
+        dilation (int, optional):
+            Dilation of the 1D convolution, by default 4
     """
 
-    def __init__(self, encoder_hidden, residual_channels, gate_channels,
-                 kernel_size, dilation):
+    def __init__(self,
+                 encoder_hidden: int=256,
+                 residual_channels: int=256,
+                 gate_channels: int=512,
+                 kernel_size: int=3,
+                 dilation: int=4):
         super().__init__()
         self.dilated_conv = Conv1D(
             residual_channels,
@@ -67,17 +83,26 @@ def __init__(self, encoder_hidden, residual_channels, gate_channels,
         self.conditioner_projection = Conv1D(encoder_hidden, gate_channels, 1)
         self.output_projection = Conv1D(residual_channels, gate_channels, 1)
 
-    def forward(self, x, conditioner, diffusion_step):
-        """_summary_
-
+    def forward(
+            self,
+            x: paddle.Tensor,
+            diffusion_step: paddle.Tensor,
+            cond: paddle.Tensor, ):
+        """Calculate forward propagation.
         Args:
-            nn (_type_): _description_
+            x (Tensor(float32)): input feature. (B, residual_channels, T)
+            diffusion_step (Tensor(int64)): The timestep input (adding noise step). (B,)
+            cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output).
+                (B, residual_channels, T)
+
+        Returns:
+            x (Tensor(float32)): output (B, residual_channels, T)
+
+        """
         diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
-        conditioner = self.conditioner_projection(conditioner)
+        cond = self.conditioner_projection(cond)
 
         y = x + diffusion_step
 
-        y = self.dilated_conv(y) + conditioner
+        y = self.dilated_conv(y) + cond
 
         gate, filter = paddle.chunk(y, 2, axis=1)
         y = F.sigmoid(gate) * paddle.tanh(filter)
@@ -88,22 +113,14 @@ def forward(self, x, conditioner, diffusion_step):
 
 
 class SinusoidalPosEmb(nn.Layer):
-    """_summary_
-
-    Args:
-        nn (_type_): _description_
+    """Positional embedding
     """
 
-    def __init__(self, dim):
+    def __init__(self, dim: int=256):
         super().__init__()
         self.dim = dim
 
-    def forward(self, x):
-        """_summary_
-
-        Args:
-            nn (_type_): _description_
-        """
+    def forward(self, x: paddle.Tensor):
         x = paddle.cast(x, 'float32')
         half_dim = self.dim // 2
         emb = math.log(10000) / (half_dim - 1)
@@ -114,6 +131,36 @@ def forward(self, x):
 
 
 class DiffNet(nn.Layer):
+    """A Mel-Spectrogram Denoiser
+
+    Args:
+        in_channels (int, optional):
+            Number of channels of the input mel-spectrogram, by default 80
+        out_channels (int, optional):
+            Number of channels of the output mel-spectrogram, by default 80
+        kernel_size (int, optional):
+            Kernel size of the residual blocks inside, by default 3
+        layers (int, optional):
+            Number of residual blocks inside, by default 20
+        stacks (int, optional):
+            The number of groups to split the residual blocks into, by default 5
+            Within each group, the dilation of the residual block grows exponentially.
+        residual_channels (int, optional):
+            Residual channel of the residual blocks, by default 256
+        gate_channels (int, optional):
+            Gate channel of the residual blocks, by default 512
+        skip_channels (int, optional):
+            Skip channel of the residual blocks, by default 256
+        aux_channels (int, optional):
+            Auxiliary channel of the residual blocks, by default 256
+        dropout (float, optional):
+            Dropout of the residual blocks, by default 0.
+        bias (bool, optional):
+            Whether to use bias in residual blocks, by default True
+        use_weight_norm (bool, optional):
+            Whether to use weight norm in all convolutions, by default False
+    """
+
     def __init__(
             self,
             in_channels: int=80,
@@ -162,13 +209,20 @@ def __init__(
             self.out_channels, 1)
         zeros_(self.output_projection.weight)
 
-    def forward(self, spec, diffusion_step, cond):
-        """
+    def forward(
+            self,
+            spec: paddle.Tensor,
+            diffusion_step: paddle.Tensor,
+            cond: paddle.Tensor, ):
+        """Calculate forward propagation.
+        Args:
+            spec (Tensor(float32)): The input mel-spectrogram. (B, n_mel, T)
+            diffusion_step (Tensor(int64)): The timestep input (adding noise step). (B,)
+            cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output).
+                (B, D_enc_out, T)
+
+        Returns:
+            x (Tensor(float32)): the predicted noise. (B, n_mel, T)
 
-        :param spec: [B, M, T]
-        :param diffusion_step: [B, 1]
-        :param cond: [B, M, T]
-        :return:
         """
         x = spec
         x = self.input_projection(x)  # x [B, residual_channel, T]
@@ -178,7 +232,10 @@ def forward(self, spec, diffusion_step, cond):
         diffusion_step = self.mlp(diffusion_step)
         skip = []
         for layer_id, layer in enumerate(self.residual_layers):
-            x, skip_connection = layer(x, cond, diffusion_step)
+            x, skip_connection = layer(
+                x=x,
+                diffusion_step=diffusion_step,
+                cond=cond, )
             skip.append(skip_connection)
         x = paddle.sum(
             paddle.stack(skip), axis=0) / math.sqrt(len(self.residual_layers))
diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py
index 4ada574899d..3222a8032a9 100644
--- a/paddlespeech/t2s/modules/diffusion.py
+++ b/paddlespeech/t2s/modules/diffusion.py
@@ -44,6 +44,13 @@ class GaussianDiffusion(nn.Layer):
             beta schedule parameter for the scheduler, by default 'squaredcos_cap_v2' (cosine schedule).
         num_max_timesteps (int, optional):
             The max timestep transition from real to noise, by default None.
+        stretch (bool, optional):
+            Whether to stretch before diffusion, by default True.
+        min_values (paddle.Tensor):
+            The minimum value of the feature to stretch.
+        max_values (paddle.Tensor):
+            The maximum value of the feature to stretch.
+
 
         Examples:
             >>> import paddle
@@ -191,7 +198,6 @@ def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None
         """
 
         if self.stretch:
-            assert self.min_values is not None and self.max_values is not None, "self.min_values and self.max_values should not be None."
             x = x.transpose((0, 2, 1))
             x = self.norm_spec(x)
             x = x.transpose((0, 2, 1))
@@ -291,7 +297,6 @@ def inference(self,
             noisy_input = noise
 
         if self.stretch and ref_x is not None:
-            assert self.min_values is not None and self.max_values is not None, "self.min_values and self.max_values should not be None."
             ref_x = ref_x.transpose((0, 2, 1))
             ref_x = self.norm_spec(ref_x)
             ref_x = ref_x.transpose((0, 2, 1))
@@ -315,7 +320,6 @@ def inference(self,
             denoised_output = paddle.clip(denoised_output, n_min, n_max)
 
         if self.stretch:
-            assert self.min_values is not None and self.max_values is not None, "self.min_values and self.max_values should not be None."
             denoised_output = denoised_output.transpose((0, 2, 1))
             denoised_output = self.denorm_spec(denoised_output)
             denoised_output = denoised_output.transpose((0, 2, 1))
diff --git a/paddlespeech/t2s/modules/wavenet_denoiser.py b/paddlespeech/t2s/modules/wavenet_denoiser.py
index 471a822b640..f84a0893dcd 100644
--- a/paddlespeech/t2s/modules/wavenet_denoiser.py
+++ b/paddlespeech/t2s/modules/wavenet_denoiser.py
@@ -131,19 +131,19 @@ def __init__(
         if use_weight_norm:
             self.apply_weight_norm()
 
-    def forward(self, x, t, c):
+    def forward(self, x: paddle.Tensor, t: paddle.Tensor, c: paddle.Tensor):
         """Denoise mel-spectrogram.
 
         Args:
             x(Tensor):
-                Shape (N, C_in, T), The input mel-spectrogram.
+                Shape (B, C_in, T), The input mel-spectrogram.
             t(Tensor):
-                Shape (N), The timestep input.
+                Shape (B), The timestep input.
             c(Tensor):
-                Shape (N, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output).
+                Shape (B, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output).
 
         Returns:
-            Tensor: Shape (N, C_out, T), the denoised mel-spectrogram.
+            Tensor: Shape (B, C_out, T), the predicted noise.
         """
         assert c.shape[-1] == x.shape[-1]
 
@@ -189,4 +189,3 @@ def _remove_weight_norm(layer):
                 pass
 
         self.apply(_remove_weight_norm)
-
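As background for the `noise_pred`/`noise_target` pair compared by the updater's L1 loss: the denoiser is trained on the standard DDPM objective of recovering the noise injected at a sampled timestep. A minimal paddle sketch under that assumption (variable names illustrative; the repo's `GaussianDiffusion` delegates the actual scheduling to its `beta_schedule`/`num_max_timesteps` configuration):

```python
import paddle


def ddpm_training_pair(x0: paddle.Tensor, t: paddle.Tensor,
                       alphas_cumprod: paddle.Tensor):
    # x0: clean stretched mel, (B, n_mel, T); t: sampled timesteps, (B,).
    noise = paddle.randn(x0.shape)
    a_bar = paddle.gather(alphas_cumprod, t).reshape([-1, 1, 1])
    # Forward process: x_t = sqrt(a_bar_t) * x0 + sqrt(1 - a_bar_t) * eps.
    x_t = paddle.sqrt(a_bar) * x0 + paddle.sqrt(1.0 - a_bar) * noise
    # The denoiser predicts eps from (x_t, t, cond); L1(noise_pred, noise_target)
    # is then computed under the mel masks.
    return x_t, noise
```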