diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger.py b/paddlespeech/t2s/models/diffsinger/diffsinger.py index 1fa4dfd39d3..b86d835bce3 100644 --- a/paddlespeech/t2s/models/diffsinger/diffsinger.py +++ b/paddlespeech/t2s/models/diffsinger/diffsinger.py @@ -348,7 +348,7 @@ def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False): note_dur=note_dur, is_slur=is_slur, get_mel_fs2=get_mel_fs2) - logmel = self.normalizer.inverse(normalized_mel) + logmel = normalized_mel return logmel diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py index 621dfe5304b..4ada574899d 100644 --- a/paddlespeech/t2s/modules/diffusion.py +++ b/paddlespeech/t2s/modules/diffusion.py @@ -223,7 +223,7 @@ def inference(self, num_inference_steps: Optional[int]=1000, strength: Optional[float]=None, scheduler_type: Optional[str]="ddpm", - clip_noise: Optional[bool]=True, + clip_noise: Optional[bool]=False, clip_noise_range: Optional[Tuple[float, float]]=(-1, 1), callback: Optional[Callable[[int, int, int, paddle.Tensor], None]]=None, @@ -302,6 +302,9 @@ def inference(self, noisy_input = scheduler.add_noise(ref_x, noise, timesteps[0]) denoised_output = noisy_input + if clip_noise: + n_min, n_max = clip_noise_range + denoised_output = paddle.clip(denoised_output, n_min, n_max) for i, t in enumerate(timesteps): denoised_output = scheduler.scale_model_input(denoised_output, t) noise_pred = self.denoiser(denoised_output, t, cond)