PaddlePaddle · yt605155624 · May 12, 2022 · May 12, 2022
diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -55,8 +55,11 @@ def process_sentence(config: Dict[str, Any],
     if utt_id in sentences:
         # reading, resampling may occur
         wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
             return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
         assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
         assert np.abs(wav).max(
         ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."

diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
@@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any],
     if utt_id in sentences:
         # reading, resampling may occur
         y, _ = librosa.load(str(fp), sr=config.fs)
-        if len(y.shape) != 1 or np.abs(y).max() > 1.0:
+        if len(y.shape) != 1:
             return record
+        max_value = np.abs(y).max()
+        if max_value > 1.0:
+            y = y / max_value
         assert len(y.shape) == 1, f"{utt_id} is not a mono-channel audio."
         assert np.abs(y).max(
         ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."

diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
@@ -47,8 +47,11 @@ def process_sentence(config: Dict[str, Any],
     if utt_id in sentences:
         # reading, resampling may occur
         wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
             return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
         assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
         assert np.abs(wav).max(
         ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."

diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py
@@ -51,8 +51,11 @@ def process_sentence(config: Dict[str, Any],
     if utt_id in sentences:
         # reading, resampling may occur
         wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1 or np.abs(wav).max() > 1.0:
+        if len(wav.shape) != 1:
             return record
+        max_value = np.abs(wav).max()
+        if max_value > 1.0:
+            wav = wav / max_value
         assert len(wav.shape) == 1, f"{utt_id} is not a mono-channel audio."
         assert np.abs(wav).max(
         ) <= 1.0, f"{utt_id} is seems to be different that 16 bit PCM."