From 36a3a9bf8bf071e27692acf50dcc370b5e00114c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 6 Jul 2023 09:59:14 +0200
Subject: [PATCH 1/2] Export multispeaker onnx

---
 TTS/tts/models/vits.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index bc96f5dc35..7c11ba583a 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1875,7 +1875,7 @@ def onnx_inference(text, text_lengths, scales, sid=None):
     def load_onnx(self, model_path: str, cuda=False):
         import onnxruntime as ort

-        providers = ["CPUExecutionProvider" if cuda is False else "CUDAExecutionProvider"]
+        providers = ["CPUExecutionProvider" if cuda is False else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})]
         sess_options = ort.SessionOptions()
         self.onnx_sess = ort.InferenceSession(
             model_path,
@@ -1883,10 +1883,8 @@ def load_onnx(self, model_path: str, cuda=False):
             providers=providers,
         )

-    def inference_onnx(self, x, x_lengths=None):
-        """ONNX inference (only single speaker models are supported)
-
-        TODO: implement multi speaker support.
+    def inference_onnx(self, x, x_lengths=None, speaker_id=None):
+        """ONNX inference
         """

         if isinstance(x, torch.Tensor):
@@ -1907,7 +1905,7 @@ def inference_onnx(self, x, x_lengths=None):
                 "input": x,
                 "input_lengths": x_lengths,
                 "scales": scales,
-                "sid": None,
+                "sid": torch.tensor([speaker_id]).cpu().numpy(),
             },
         )
         return audio[0][0]

From a4edc9b7777b120cddca2c0fb394caad3b35aeaf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Eren=20G=C3=B6lge?=
Date: Thu, 6 Jul 2023 09:59:57 +0200
Subject: [PATCH 2/2] Make style

---
 TTS/tts/layers/bark/hubert/kmeans_hubert.py | 2 ++
 TTS/tts/models/vits.py                      | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/TTS/tts/layers/bark/hubert/kmeans_hubert.py b/TTS/tts/layers/bark/hubert/kmeans_hubert.py
index c7724c233c..a6a3b9aeb1 100644
--- a/TTS/tts/layers/bark/hubert/kmeans_hubert.py
+++ b/TTS/tts/layers/bark/hubert/kmeans_hubert.py
@@ -15,6 +15,8 @@
 from torch import nn
 from torchaudio.functional import resample
 from transformers import HubertModel
+
+
 def round_down_nearest_multiple(num, divisor):
     return num // divisor * divisor

diff --git a/TTS/tts/models/vits.py b/TTS/tts/models/vits.py
index 7c11ba583a..f4f4c6391f 100644
--- a/TTS/tts/models/vits.py
+++ b/TTS/tts/models/vits.py
@@ -1875,7 +1875,11 @@ def onnx_inference(text, text_lengths, scales, sid=None):
     def load_onnx(self, model_path: str, cuda=False):
         import onnxruntime as ort

-        providers = ["CPUExecutionProvider" if cuda is False else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})]
+        providers = [
+            "CPUExecutionProvider"
+            if cuda is False
+            else ("CUDAExecutionProvider", {"cudnn_conv_algo_search": "DEFAULT"})
+        ]
         sess_options = ort.SessionOptions()
         self.onnx_sess = ort.InferenceSession(
             model_path,
@@ -1884,8 +1888,7 @@ def load_onnx(self, model_path: str, cuda=False):
             providers=providers,
         )

     def inference_onnx(self, x, x_lengths=None, speaker_id=None):
-        """ONNX inference
-        """
+        """ONNX inference"""

         if isinstance(x, torch.Tensor):
             x = x.cpu().numpy()
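
Usage sketch (not part of the patch): the snippet below drives an exported graph directly with onnxruntime, mirroring the feed dict that the patched inference_onnx() builds. The input names ("input", "input_lengths", "scales", "sid") come from the diff above; the model file name, token ids, scale values, and speaker index are placeholder assumptions. One caveat worth flagging: after PATCH 1/2, "sid" is built unconditionally via torch.tensor([speaker_id]), so inference_onnx() must now be called with an integer speaker_id; the default speaker_id=None would fail when the tensor is constructed.

    import numpy as np
    import onnxruntime as ort

    # Placeholder model path; assumes a file produced by the VITS ONNX export
    # whose graph signature is onnx_inference(text, text_lengths, scales, sid).
    sess = ort.InferenceSession("coqui_vits.onnx", providers=["CPUExecutionProvider"])

    x = np.random.randint(0, 100, size=(1, 50), dtype=np.int64)  # stand-in token ids, shape (1, T)
    x_lengths = np.array([x.shape[1]], dtype=np.int64)
    # Assumed ordering: noise scale, length scale, duration-predictor noise scale.
    scales = np.array([0.667, 1.0, 0.8], dtype=np.float32)
    # int64 array of shape (1,), i.e. what torch.tensor([speaker_id]).cpu().numpy()
    # produces for an integer speaker_id.
    sid = np.array([0], dtype=np.int64)

    audio = sess.run(
        None,  # fetch all outputs rather than assuming an output name
        {"input": x, "input_lengths": x_lengths, "scales": scales, "sid": sid},
    )[0]

Feeding sid as a shape-(1,) int64 array matches the patched call exactly: multi-speaker checkpoints select the speaker embedding by that index, while a single-speaker graph exported without a speaker input would reject the extra feed key.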