Commit

[TTS]Add slim for TTS (#2729)

yt605155624 authored Dec 9, 2022
1 parent 6f927d5 commit 3f6afc4
Showing 17 changed files with 513 additions and 8 deletions.
1 change: 1 addition & 0 deletions examples/csmsc/tts2/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/tts2/run.sh
@@ -72,3 +72,8 @@ fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi

# PTQ_static
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi
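As a side note, a hedged way to exercise only this new stage from the tts2 example directory, assuming run.sh parses --stage/--stop-stage via parse_options.sh like the other CSMSC recipes and that ${train_output_path}/inference already holds the exported static model:

# hypothetical invocation: run only the new PTQ_static stage (stage 9)
./run.sh --stage 9 --stop-stage 9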
8 changes: 8 additions & 0 deletions examples/csmsc/tts3/local/PTQ_dynamic.sh
@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2
weight_bits=$3

python3 ${BIN_DIR}/../PTQ_dynamic.py \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--weight_bits ${weight_bits}
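A minimal sketch of calling this helper by hand; the output path and model name are placeholders, and ${BIN_DIR} is assumed to be exported beforehand (e.g. by the example's path.sh):

# hypothetical direct call: <train_output_path> <model_name> <weight_bits>
# ${BIN_DIR} is assumed to be exported by path.sh
source path.sh
./local/PTQ_dynamic.sh exp/default fastspeech2_csmsc 8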
8 changes: 8 additions & 0 deletions examples/csmsc/tts3/local/PTQ_static.sh
@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2

python3 ${BIN_DIR}/../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True
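Likewise, a hedged sketch of invoking the static variant directly; static PTQ reads its calibration data from dump/dev/norm/metadata.jsonl, so it is assumed to run from the example root after ${BIN_DIR} has been set:

# hypothetical direct call: <train_output_path> <model_name>
source path.sh
CUDA_VISIBLE_DEVICES=0 ./local/PTQ_static.sh exp/default fastspeech2_csmsc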
13 changes: 13 additions & 0 deletions examples/csmsc/tts3/run.sh
@@ -76,3 +76,13 @@ fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi

# PTQ_dynamic
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
./local/PTQ_dynamic.sh ${train_output_path} fastspeech2_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} pwgan_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} mb_melgan_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} hifigan_csmsc 8
fi

# PTQ_static
if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi
5 changes: 5 additions & 0 deletions examples/csmsc/tts3/run_cnndecoder.sh
@@ -122,3 +122,8 @@ fi
if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict_streaming.sh ${train_output_path} || exit -1
fi

# PTQ_static
if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi
8 changes: 8 additions & 0 deletions examples/csmsc/voc1/local/PTQ_static.sh
@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2

python3 ${BIN_DIR}/../../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True
5 changes: 5 additions & 0 deletions examples/csmsc/voc1/run.sh
@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_csmsc || exit -1
fi
1 change: 1 addition & 0 deletions examples/csmsc/voc3/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/voc3/run.sh
@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} mb_melgan_csmsc || exit -1
fi
1 change: 1 addition & 0 deletions examples/csmsc/voc5/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/voc5/run.sh
@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_csmsc || exit -1
fi
67 changes: 67 additions & 0 deletions paddlespeech/t2s/datasets/am_batch_fn.py
@@ -538,3 +538,70 @@ def vits_multi_spk_batch_fn(examples):
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch


# for PaddleSlim
def fastspeech2_single_spk_batch_fn_static(examples):
text = [np.array(item["text"], dtype=np.int64) for item in examples]
text = np.array(text)
# the batch axis is not needed at inference time
text = text[0]
batch = {
"text": text,
}
return batch


def fastspeech2_multi_spk_batch_fn_static(examples):
text = [np.array(item["text"], dtype=np.int64) for item in examples]
text = np.array(text)
text = text[0]
batch = {
"text": text,
}
if "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = np.array(spk_id)
spk_id = spk_id[0]
batch["spk_id"] = spk_id
if "spk_emb" in examples[0]:
spk_emb = [
np.array(item["spk_emb"], dtype=np.float32) for item in examples
]
spk_emb = np.array(spk_emb)
spk_emb = spk_emb[0]
batch["spk_emb"] = spk_emb
return batch


def speedyspeech_single_spk_batch_fn_static(examples):
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
phones = np.array(phones)
tones = np.array(tones)
phones = phones[0]
tones = tones[0]
batch = {
"phones": phones,
"tones": tones,
}
return batch


def speedyspeech_multi_spk_batch_fn_static(examples):
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
phones = np.array(phones)
tones = np.array(tones)
phones = phones[0]
tones = tones[0]
batch = {
"phones": phones,
"tones": tones,
}
if "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = np.array(spk_id)
spk_id = spk_id[0]
batch["spk_id"] = spk_id
return batch
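A minimal sketch (with made-up token ids) of what these _static collators return: unlike the training collators above, they keep numpy arrays and drop the batch axis, which is the single-utterance layout the PTQ scripts feed to the exported inference model:

import numpy as np

from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn_static

# toy single-utterance "dataset"; the token ids are made up for illustration
examples = [{"text": np.array([13, 5, 42, 7], dtype=np.int64)}]

batch = fastspeech2_single_spk_batch_fn_static(examples)
# no batch axis: one utterance of shape (4,) with dtype int64
print(batch["text"].shape, batch["text"].dtype)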
55 changes: 47 additions & 8 deletions paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -55,13 +55,12 @@ def __call__(self, batch):
Args:
batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
Returns:
Tensor:
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
Tensor:
Target signal batch (B, 1, T).
"""
# check length
batch = [
@@ -106,11 +105,7 @@ def _adjust_length(self, x, c):
if len(x) < c.shape[0] * self.hop_size:
x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * self.hop_size:
# print(
# f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
# )
x = x[:c.shape[0] * self.hop_size]

# check the length is valid
assert len(x) == c.shape[
0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
@@ -218,3 +213,47 @@ def __call__(self, batch):
y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)

return x, y, mels


# for paddleslim


class Clip_static(Clip):
"""Collate functor that builds calibration inputs for static PTQ of vocoders.
"""

def __call__(self, batch):
"""Convert into batch tensors.
Args:
batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
Returns:
Dict[str, np.array]:
"logmel": a single auxiliary feature chunk without a batch axis,
shape (batch_max_frames + 2 * aux_context_window, C).
"""
# check length
batch = [
self._adjust_length(b['wave'], b['feats']) for b in batch
if b['feats'].shape[0] > self.mel_threshold
]
xs, cs = [b[0] for b in batch], [b[1] for b in batch]

# make batch with random cut
c_lengths = [c.shape[0] for c in cs]
start_frames = np.array([
np.random.randint(self.start_offset, cl + self.end_offset)
for cl in c_lengths
])

c_starts = start_frames - self.aux_context_window
c_ends = start_frames + self.batch_max_frames + self.aux_context_window
c_batch = np.stack(
[c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
# the inference layout (T', C) differs from the training layout (B, C, T')
# c_batch = c_batch.transpose([0, 2, 1]) # (B, C, T')
# the batch axis is not needed at inference time
c_batch = c_batch[0]
batch = {"logmel": c_batch}
return batch
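A rough usage sketch for Clip_static with random data; the constructor arguments (batch_max_steps, hop_size, aux_context_window) are assumed to be inherited unchanged from the parent Clip class, so treat the parameter names and values as illustrative:

import numpy as np

from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static

# assumed constructor signature, inherited from Clip; values are placeholders
collate_fn = Clip_static(
    batch_max_steps=8192, hop_size=256, aux_context_window=0)

n_frames, n_mels, hop_size = 100, 80, 256
example = {
    "wave": np.random.randn(n_frames * hop_size).astype(np.float32),
    "feats": np.random.randn(n_frames, n_mels).astype(np.float32),
}

batch = collate_fn([example])
# a single mel chunk without a batch axis, e.g. (32, 80) for these settings
print(batch["logmel"].shape)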
80 changes: 80 additions & 0 deletions paddlespeech/t2s/exps/PTQ_dynamic.py
@@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import paddle
from paddleslim.quant import quant_post_dynamic


def parse_args():
parser = argparse.ArgumentParser(
description="PaddleSlim dynamic post-training quantization for acoustic models & vocoders.")
# acoustic model
parser.add_argument(
'--model_name',
type=str,
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc',
'fastspeech2_csmsc',
'fastspeech2_aishell3',
'fastspeech2_ljspeech',
'fastspeech2_vctk',
'tacotron2_csmsc',
'fastspeech2_mix',
'pwgan_csmsc',
'pwgan_aishell3',
'pwgan_ljspeech',
'pwgan_vctk',
'mb_melgan_csmsc',
'hifigan_csmsc',
'hifigan_aishell3',
'hifigan_ljspeech',
'hifigan_vctk',
'wavernn_csmsc',
],
help='Choose model type of tts task.')

parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models")
parser.add_argument(
"--weight_bits",
type=int,
default=8,
choices=[8, 16],
help="The bits for the quantized weight, and it should be 8 or 16. Default is 8.",
)

args, _ = parser.parse_known_args()
return args


# for now, only inference models trained on CSMSC are supported
def main():
args = parse_args()
paddle.enable_static()
quant_post_dynamic(
model_dir=args.inference_dir,
save_model_dir=args.inference_dir,
model_filename=args.model_name + ".pdmodel",
params_filename=args.model_name + ".pdiparams",
save_model_filename=args.model_name + "_" + str(args.weight_bits) +
"bits.pdmodel",
save_params_filename=args.model_name + "_" + str(args.weight_bits) +
"bits.pdiparams",
weight_bits=args.weight_bits, )


if __name__ == "__main__":
main()
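For completeness, a hedged example of invoking the script directly with the flags defined above; the inference directory is a placeholder and is expected to already contain <model_name>.pdmodel / <model_name>.pdiparams:

# hypothetical direct invocation (paths are placeholders)
python3 paddlespeech/t2s/exps/PTQ_dynamic.py \
    --inference_dir exp/default/inference \
    --model_name fastspeech2_csmsc \
    --weight_bits 8
# per the save_* arguments above, the quantized model should land at
#   exp/default/inference/fastspeech2_csmsc_8bits.pdmodel
#   exp/default/inference/fastspeech2_csmsc_8bits.pdiparams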