From fb619e589d574abd7cbaa8c8d0f2e02ee6424be4 Mon Sep 17 00:00:00 2001
From: Spycsh <39623753+Spycsh@users.noreply.github.com>
Date: Wed, 13 Sep 2023 21:32:52 +0800
Subject: [PATCH] Support HPU on ASR/TTS (#290)

* hpu asr support

* add hpu support for tts

* enhance device condition

* fix lint and coverage

* remove env variables

* remove cpu_pool

---------

Co-authored-by: Haihao Shen
---
 .../neural_chat/pipeline/plugins/audio/tts.py | 29 ++++++-------------
 .../neural_chat/tests/audio/test_asr.py       | 23 ++++++++++++---
 .../neural_chat/tests/audio/test_tts.py       | 19 ++++++++++--
 3 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py
index 270d698ff37..76f1441c1a6 100644
--- a/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py
+++ b/intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/tts.py
@@ -25,7 +25,6 @@
 import soundfile as sf
 import numpy as np
 import contextlib
-import intel_extension_for_pytorch as ipex
 
 from .utils.english_normalizer import EnglishNormalizer
 
@@ -40,19 +39,18 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
                  asset_path="/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets"):
         """Make sure your export LD_PRELOAD= beforehand."""
         # default setting
-        self.original_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
-        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
         self.device = device
+        self.original_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
+        self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
         self.voice = voice
         self.output_audio_path = output_audio_path
         self.stream_mode = stream_mode
         self.spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
         self.speaker_model = EncoderClassifier.from_hparams(
             source=self.spk_model_name,
-            run_opts={"device": self.device},
-            savedir=os.path.join("/tmp", self.spk_model_name)
-        )
-        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+            run_opts={"device": "cpu"},
+            savedir=os.path.join("/tmp", self.spk_model_name))
+        self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
         self.vocoder.eval()
         script_dir = os.path.dirname(os.path.abspath(__file__))
         if os.path.exists(os.path.join(script_dir, '../../../assets/speaker_embeddings/spk_embed_default.pt')):
@@ -77,15 +75,6 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
         elif os.path.exists(os.path.join(asset_path, 'speaker_embeddings/spk_embed_male.pt')):
             self.male_speaker_embeddings = torch.load(os.path.join(asset_path, 'speaker_embeddings/spk_embed_male.pt'))
 
-        self.cpu_pool = None
-        if not torch.cuda.is_available():
-            # ipex IOMP hardware resources
-            if 'LD_PRELOAD' in os.environ and 'libiomp' in os.environ['LD_PRELOAD']:
-                import intel_extension_for_pytorch as ipex
-                self.cpu_pool = ipex.cpu.runtime.CPUPool([i for i in range(24)])
-            else:
-                print("Warning! You have not preloaded iomp beforehand and that may lead to performance issue")
-
         self.normalizer = EnglishNormalizer()
 
     def create_speaker_embedding(self, driven_audio_path):
@@ -97,10 +86,10 @@ def create_speaker_embedding(self, driven_audio_path):
                 [driven_audio_path]}).cast_column("audio", Audio(sampling_rate=16000))
         waveform = audio_dataset[0]["audio"]['array']
         with torch.no_grad():
-            speaker_embeddings = self.speaker_model.encode_batch(torch.tensor(waveform))
+            speaker_embeddings = self.speaker_model.encode_batch(torch.tensor(waveform).to("cpu"))
             speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2) # [1,1,512]
             speaker_embeddings = speaker_embeddings[0] # [1,512]
-        return speaker_embeddings.cpu()
+        return speaker_embeddings.to(self.device)
 
     def _lookup_voice_embedding(self, voice,
                                 asset_path="/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets"):
@@ -179,8 +168,8 @@ def text2speech(self, text, output_audio_path, voice="default", do_batch_tts=Fal
         for text_in in texts:
             inputs = self.processor(text=text_in, return_tensors="pt")
             with torch.no_grad():
-                with ipex.cpu.runtime.pin(self.cpu_pool) if self.cpu_pool else contextlib.nullcontext():
-                    spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+                spectrogram = model.generate_speech(
+                    inputs["input_ids"].to(self.device), speaker_embeddings.to(self.device))
                 speech = self.vocoder(spectrogram)
                 all_speech = np.concatenate([all_speech, speech.cpu().numpy()])
                 all_speech = np.concatenate([all_speech, np.array([0 for i in range(8000)])]) # pad after each end
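The tts.py change above drops the IPEX CPU-pool pinning in favor of plain device placement, so one code path now serves CPU, CUDA, and HPU: models go to the target device once at load time, per-call tensors hop over at use time, and the SpeechBrain speaker encoder stays on CPU with only the finished embedding moved to self.device. A minimal sketch of the pattern, not part of the patch (assumes transformers is installed; the zero vector is a placeholder for a real 512-dim x-vector):

    import torch
    from transformers import SpeechT5ForTextToSpeech, SpeechT5HifiGan, SpeechT5Processor

    device = "cpu"  # or "cuda" / "hpu", resolved as in the test changes below
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
    vocoder.eval()

    inputs = processor(text="Welcome to Neural Chat", return_tensors="pt")
    speaker_embeddings = torch.zeros((1, 512)).to(device)  # placeholder x-vector
    with torch.no_grad():
        # mirror the patched text2speech: inputs move to the device per call
        spectrogram = model.generate_speech(inputs["input_ids"].to(device), speaker_embeddings)
        waveform = vocoder(spectrogram).cpu().numpy()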
diff --git a/intel_extension_for_transformers/neural_chat/tests/audio/test_asr.py b/intel_extension_for_transformers/neural_chat/tests/audio/test_asr.py
index 28eeca05807..20f6e93ebfd 100644
--- a/intel_extension_for_transformers/neural_chat/tests/audio/test_asr.py
+++ b/intel_extension_for_transformers/neural_chat/tests/audio/test_asr.py
@@ -23,10 +23,25 @@ class TestASR(unittest.TestCase):
 
     @classmethod
     def setUpClass(self):
-        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.asr = AudioSpeechRecognition("openai/whisper-small", device=device)
-        if not torch.cuda.is_available():
+        try:
+            import habana_frameworks.torch.hpu as hthpu
+            self.is_hpu_available = True
+        except ImportError:
+            self.is_hpu_available = False
+        try:
+            import intel_extension_for_pytorch as intel_ipex
+            self.is_ipex_available = True
+        except ImportError:
+            self.is_ipex_available = False
+        if self.is_hpu_available:
+            self.device = "hpu"
+        else:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.asr = AudioSpeechRecognition("openai/whisper-small", device=self.device)
+        if self.device == "cpu" and self.is_ipex_available:
             self.asr_bf16 = AudioSpeechRecognition("openai/whisper-small", bf16=True)
+        else:
+            self.asr_bf16 = None
 
     def test_audio2text(self):
         audio_path = "/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets/audio/welcome.wav"
@@ -37,7 +52,7 @@
         self.assertEqual(text.lower(), "Welcome to Neural Chat".lower())
 
     def test_audio2text_bf16(self):
-        if torch.cuda.is_available():
+        if self.asr_bf16 is None:
             return
         audio_path = "/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets/audio/welcome.wav"
         audio_path = "/intel-extension-for-transformers/intel_extension_for_transformers/neural_chat/assets/audio/welcome.wav"
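Both test files use the same import-probe to pick a device, and HPU deliberately takes priority: torch.cuda.is_available() knows nothing about Gaudi cards, and importing habana_frameworks is what registers "hpu" as a usable torch device in the first place. The probe, pulled out as a standalone helper for illustration (the helper name is ours, not part of the patch):

    import torch

    def resolve_device() -> str:
        """Pick 'hpu' when the Habana stack imports, else 'cuda', else 'cpu'."""
        try:
            import habana_frameworks.torch.hpu  # noqa: F401  # import registers the HPU backend
            return "hpu"
        except ImportError:
            return "cuda" if torch.cuda.is_available() else "cpu"

    device = resolve_device()  # e.g. AudioSpeechRecognition("openai/whisper-small", device=device)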
diff --git a/intel_extension_for_transformers/neural_chat/tests/audio/test_tts.py b/intel_extension_for_transformers/neural_chat/tests/audio/test_tts.py
index a094f37bf04..d93356c79e0 100644
--- a/intel_extension_for_transformers/neural_chat/tests/audio/test_tts.py
+++ b/intel_extension_for_transformers/neural_chat/tests/audio/test_tts.py
@@ -26,8 +26,23 @@ class TestTTS(unittest.TestCase):
     @classmethod
     def setUpClass(self):
-        self.tts = TextToSpeech(device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-        self.asr = AudioSpeechRecognition("openai/whisper-small")
+        try:
+            import habana_frameworks.torch.hpu as hthpu
+            self.is_hpu_available = True
+        except ImportError:
+            self.is_hpu_available = False
+        try:
+            import intel_extension_for_pytorch as ipex
+            self.is_ipex_available = True
+        except ImportError:
+            self.is_ipex_available = False
+        if self.is_hpu_available:
+            self.device = "hpu"
+        else:
+            self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.tts = TextToSpeech(device=self.device)
+        self.asr = AudioSpeechRecognition("openai/whisper-small", device=self.device)
+
         shutil.rmtree('./tmp_audio', ignore_errors=True)
         os.mkdir('./tmp_audio')
 
     @classmethod
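test_tts.py gets the same probe, and both fixtures are now built on the resolved device. The companion change in test_asr.py is worth noting: the bf16 test no longer re-probes the environment but checks the fixture itself, so the guard can never disagree with what setUpClass actually constructed. A self-contained sketch of that sentinel pattern, with a stub standing in for the real bf16 ASR pipeline:

    import unittest
    import torch

    class TestBF16Guard(unittest.TestCase):
        """Illustrative stand-in for the guard pattern in test_asr.py."""

        @classmethod
        def setUpClass(cls):
            try:
                import intel_extension_for_pytorch  # noqa: F401
                ipex_available = True
            except ImportError:
                ipex_available = False
            device = "cuda" if torch.cuda.is_available() else "cpu"
            # build the bf16 pipeline only where it can run; None marks "not built"
            cls.asr_bf16 = object() if device == "cpu" and ipex_available else None

        def test_audio2text_bf16(self):
            if self.asr_bf16 is None:
                return  # as in the patch; self.skipTest(...) would surface the skip instead
            self.assertIsNotNone(self.asr_bf16)

    if __name__ == "__main__":
        unittest.main()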