Commit

[TTS]Add slim for TTS (#2729)

yt605155624 authored Dec 9, 2022
1 parent 6f927d5 commit 3f6afc4
Showing 17 changed files with 513 additions and 8 deletions.
1 change: 1 addition & 0 deletions examples/csmsc/tts2/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/tts2/run.sh
@@ -72,3 +72,8 @@ fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi

# PTQ_static
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} speedyspeech_csmsc || exit -1
fi
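As a side note, a hedged way to exercise only this new stage from the tts2 example directory, assuming run.sh parses --stage/--stop-stage via parse_options.sh like the other CSMSC recipes and that ${train_output_path}/inference already holds the exported static model:

# hypothetical invocation: run only the new PTQ_static stage (stage 9)
./run.sh --stage 9 --stop-stage 9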
8 changes: 8 additions & 0 deletions examples/csmsc/tts3/local/PTQ_dynamic.sh
@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2
weight_bits=$3

python3 ${BIN_DIR}/../PTQ_dynamic.py \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--weight_bits ${weight_bits}
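A minimal sketch of calling this helper by hand; the output path and model name are placeholders, and ${BIN_DIR} is assumed to be exported beforehand (e.g. by the example's path.sh):

# hypothetical direct call: <train_output_path> <model_name> <weight_bits>
# ${BIN_DIR} is assumed to be exported by path.sh
source path.sh
./local/PTQ_dynamic.sh exp/default fastspeech2_csmsc 8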
8 changes: 8 additions & 0 deletions examples/csmsc/tts3/local/PTQ_static.sh
@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2

python3 ${BIN_DIR}/../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True
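Likewise, a hedged sketch of invoking the static variant directly; static PTQ reads its calibration data from dump/dev/norm/metadata.jsonl, so it is assumed to run from the example root after ${BIN_DIR} has been set:

# hypothetical direct call: <train_output_path> <model_name>
source path.sh
CUDA_VISIBLE_DEVICES=0 ./local/PTQ_static.sh exp/default fastspeech2_csmsc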
13 changes: 13 additions & 0 deletions examples/csmsc/tts3/run.sh
@@ -76,3 +76,13 @@ fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi

# PTQ_dynamic
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
./local/PTQ_dynamic.sh ${train_output_path} fastspeech2_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} pwgan_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} mb_melgan_csmsc 8
# ./local/PTQ_dynamic.sh ${train_output_path} hifigan_csmsc 8
fi

# PTQ_static
if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi
5 changes: 5 additions & 0 deletions examples/csmsc/tts3/run_cnndecoder.sh
@@ -122,3 +122,8 @@ fi
if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict_streaming.sh ${train_output_path} || exit -1
fi

# PTQ_static
if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} fastspeech2_csmsc || exit -1
fi
8 changes: 8 additions & 0 deletions examples/csmsc/voc1/local/PTQ_static.sh
@@ -0,0 +1,8 @@
train_output_path=$1
model_name=$2

python3 ${BIN_DIR}/../../PTQ_static.py \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--inference_dir ${train_output_path}/inference \
--model_name ${model_name} \
--onnx_format=True
5 changes: 5 additions & 0 deletions examples/csmsc/voc1/run.sh
@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_csmsc || exit -1
fi
1 change: 1 addition & 0 deletions examples/csmsc/voc3/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/voc3/run.sh
@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} mb_melgan_csmsc || exit -1
fi
1 change: 1 addition & 0 deletions examples/csmsc/voc5/local/PTQ_static.sh
5 changes: 5 additions & 0 deletions examples/csmsc/voc5/run.sh
@@ -30,3 +30,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# synthesize
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi

# PTQ_static
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_csmsc || exit -1
fi
67 changes: 67 additions & 0 deletions paddlespeech/t2s/datasets/am_batch_fn.py
@@ -538,3 +538,70 @@ def vits_multi_spk_batch_fn(examples):
spk_id = paddle.to_tensor(spk_id)
batch["spk_id"] = spk_id
return batch


# for PaddleSlim
def fastspeech2_single_spk_batch_fn_static(examples):
text = [np.array(item["text"], dtype=np.int64) for item in examples]
text = np.array(text)
# the batch axis is not needed at inference time
text = text[0]
batch = {
"text": text,
}
return batch


def fastspeech2_multi_spk_batch_fn_static(examples):
text = [np.array(item["text"], dtype=np.int64) for item in examples]
text = np.array(text)
text = text[0]
batch = {
"text": text,
}
if "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = np.array(spk_id)
spk_id = spk_id[0]
batch["spk_id"] = spk_id
if "spk_emb" in examples[0]:
spk_emb = [
np.array(item["spk_emb"], dtype=np.float32) for item in examples
]
spk_emb = np.array(spk_emb)
spk_emb = spk_emb[0]
batch["spk_emb"] = spk_emb
return batch


def speedyspeech_single_spk_batch_fn_static(examples):
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
phones = np.array(phones)
tones = np.array(tones)
phones = phones[0]
tones = tones[0]
batch = {
"phones": phones,
"tones": tones,
}
return batch


def speedyspeech_multi_spk_batch_fn_static(examples):
phones = [np.array(item["phones"], dtype=np.int64) for item in examples]
tones = [np.array(item["tones"], dtype=np.int64) for item in examples]
phones = np.array(phones)
tones = np.array(tones)
phones = phones[0]
tones = tones[0]
batch = {
"phones": phones,
"tones": tones,
}
if "spk_id" in examples[0]:
spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
spk_id = np.array(spk_id)
spk_id = spk_id[0]
batch["spk_id"] = spk_id
return batch
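A minimal sketch (with made-up token ids) of what these _static collators return: unlike the training collators above, they keep numpy arrays and drop the batch axis, which is the single-utterance layout the PTQ scripts feed to the exported inference model:

import numpy as np

from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn_static

# toy single-utterance "dataset"; the token ids are made up for illustration
examples = [{"text": np.array([13, 5, 42, 7], dtype=np.int64)}]

batch = fastspeech2_single_spk_batch_fn_static(examples)
# no batch axis: one utterance of shape (4,) with dtype int64
print(batch["text"].shape, batch["text"].dtype)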
55 changes: 47 additions & 8 deletions paddlespeech/t2s/datasets/vocoder_batch_fn.py
@@ -55,13 +55,12 @@ def __call__(self, batch):
Args:
batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
Returns:
Tensor:
Auxiliary feature batch (B, C, T'), where
T = (T' - 2 * aux_context_window) * hop_size.
Tensor:
Target signal batch (B, 1, T).
"""
# check length
batch = [
@@ -106,11 +105,7 @@ def _adjust_length(self, x, c):
if len(x) < c.shape[0] * self.hop_size:
x = np.pad(x, (0, c.shape[0] * self.hop_size - len(x)), mode="edge")
elif len(x) > c.shape[0] * self.hop_size:
# print(
# f"wave length: ({len(x)}), mel length: ({c.shape[0]}), hop size: ({self.hop_size })"
# )
x = x[:c.shape[0] * self.hop_size]

# check the length is valid
assert len(x) == c.shape[
0] * self.hop_size, f"wave length: ({len(x)}), mel length: ({c.shape[0]})"
@@ -218,3 +213,47 @@ def __call__(self, batch):
y = label_2_float(paddle.cast(y, dtype='float32'), self.bits)

return x, y, mels


# for paddleslim


class Clip_static(Clip):
"""Collate functor that builds calibration inputs for static PTQ of vocoders.
"""

def __call__(self, batch):
"""Convert into batch tensors.
Args:
batch (list): list of tuple of the pair of audio and features. Audio shape (T, ), features shape(T', C).
Returns:
Dict[str, np.array]:
"logmel": a single auxiliary feature chunk without a batch axis,
shape (batch_max_frames + 2 * aux_context_window, C).
"""
# check length
batch = [
self._adjust_length(b['wave'], b['feats']) for b in batch
if b['feats'].shape[0] > self.mel_threshold
]
xs, cs = [b[0] for b in batch], [b[1] for b in batch]

# make batch with random cut
c_lengths = [c.shape[0] for c in cs]
start_frames = np.array([
np.random.randint(self.start_offset, cl + self.end_offset)
for cl in c_lengths
])

c_starts = start_frames - self.aux_context_window
c_ends = start_frames + self.batch_max_frames + self.aux_context_window
c_batch = np.stack(
[c[start:end] for c, start, end in zip(cs, c_starts, c_ends)])
# the inference layout (T', C) differs from the training layout (B, C, T')
# c_batch = c_batch.transpose([0, 2, 1]) # (B, C, T')
# the batch axis is not needed at inference time
c_batch = c_batch[0]
batch = {"logmel": c_batch}
return batch
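A rough usage sketch for Clip_static with random data; the constructor arguments (batch_max_steps, hop_size, aux_context_window) are assumed to be inherited unchanged from the parent Clip class, so treat the parameter names and values as illustrative:

import numpy as np

from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static

# assumed constructor signature, inherited from Clip; values are placeholders
collate_fn = Clip_static(
    batch_max_steps=8192, hop_size=256, aux_context_window=0)

n_frames, n_mels, hop_size = 100, 80, 256
example = {
    "wave": np.random.randn(n_frames * hop_size).astype(np.float32),
    "feats": np.random.randn(n_frames, n_mels).astype(np.float32),
}

batch = collate_fn([example])
# a single mel chunk without a batch axis, e.g. (32, 80) for these settings
print(batch["logmel"].shape)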
80 changes: 80 additions & 0 deletions paddlespeech/t2s/exps/PTQ_dynamic.py
@@ -0,0 +1,80 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse

import paddle
from paddleslim.quant import quant_post_dynamic


def parse_args():
parser = argparse.ArgumentParser(
description="PaddleSlim dynamic post-training quantization for acoustic models & vocoders.")
# acoustic model
parser.add_argument(
'--model_name',
type=str,
default='fastspeech2_csmsc',
choices=[
'speedyspeech_csmsc',
'fastspeech2_csmsc',
'fastspeech2_aishell3',
'fastspeech2_ljspeech',
'fastspeech2_vctk',
'tacotron2_csmsc',
'fastspeech2_mix',
'pwgan_csmsc',
'pwgan_aishell3',
'pwgan_ljspeech',
'pwgan_vctk',
'mb_melgan_csmsc',
'hifigan_csmsc',
'hifigan_aishell3',
'hifigan_ljspeech',
'hifigan_vctk',
'wavernn_csmsc',
],
help='Choose model type of tts task.')

parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models")
parser.add_argument(
"--weight_bits",
type=int,
default=8,
choices=[8, 16],
help="The bits for the quantized weight, and it should be 8 or 16. Default is 8.",
)

args, _ = parser.parse_known_args()
return args


# for now, only inference models trained on CSMSC are supported
def main():
args = parse_args()
paddle.enable_static()
quant_post_dynamic(
model_dir=args.inference_dir,
save_model_dir=args.inference_dir,
model_filename=args.model_name + ".pdmodel",
params_filename=args.model_name + ".pdiparams",
save_model_filename=args.model_name + "_" + str(args.weight_bits) +
"bits.pdmodel",
save_params_filename=args.model_name + "_" + str(args.weight_bits) +
"bits.pdiparams",
weight_bits=args.weight_bits, )


if __name__ == "__main__":
main()
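For completeness, a hedged example of invoking the script directly with the flags defined above; the inference directory is a placeholder and is expected to already contain <model_name>.pdmodel / <model_name>.pdiparams:

# hypothetical direct invocation (paths are placeholders)
python3 paddlespeech/t2s/exps/PTQ_dynamic.py \
    --inference_dir exp/default/inference \
    --model_name fastspeech2_csmsc \
    --weight_bits 8
# per the save_* arguments above, the quantized model should land at
#   exp/default/inference/fastspeech2_csmsc_8bits.pdmodel
#   exp/default/inference/fastspeech2_csmsc_8bits.pdiparams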