Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Permalink
[NeuralChat] Fix audio plugin sample code issue and provide a way to …
Browse files Browse the repository at this point in the history
…set tts/asr model path (#1342)

* Fix audio plugin sample code issue and provide a way to set tts/asr model path

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
  • Loading branch information
lvliang-intel authored Mar 5, 2024
1 parent b494142 commit db7da09
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ pip install transformers datasets pydub
The AudioSpeechRecognition class provides functionality for converting English/Multilingual audio to text. Here's how to use it:

```python
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio import AudioSpeechRecognition
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.asr import AudioSpeechRecognition
# pass the parameter language="auto" to let the asr model automatically detect language
# otherwise, you can pass an arbitrary language to the model (e.g. en/zh/de/fr)
asr = AudioSpeechRecognition("openai/whisper-small", language="auto", device=self.device)
Expand All @@ -62,7 +62,7 @@ pip install transformers soundfile speechbrain==0.5.15
The TextToSpeech class in your module provides the capability to convert English text to speech. Here's how to use it:

```python
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio import TextToSpeech
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts import TextToSpeech
tts = TextToSpeech()
text_to_speak = "Hello, this is a sample text." # Replace with your text
output_audio_path = "./output.wav" # Replace with the desired output audio path
Expand All @@ -85,7 +85,7 @@ pip install paddlespeech paddlepaddle
The ChineseTextToSpeech class within your module provides functionality for TTS. Here's how to use it:

```python
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio import ChineseTextToSpeech
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts_chinese import ChineseTextToSpeech
# Initialize the TTS module
tts = ChineseTextToSpeech()
# Define the text you want to convert to speech
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# limitations under the License.

import torch
import os
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from datasets import Audio, Dataset
import time
Expand All @@ -36,8 +37,9 @@ def __init__(self, model_name_or_path="openai/whisper-small", bf16=False, langua
if device == "auto":
device = get_device_type()
self.device = device
self.model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path).to(self.device)
self.processor = WhisperProcessor.from_pretrained(model_name_or_path)
asr_model_name_or_path = os.environ.get("ASR_MODEL_PATH", model_name_or_path)
self.model = WhisperForConditionalGeneration.from_pretrained(asr_model_name_or_path).to(self.device)
self.processor = WhisperProcessor.from_pretrained(asr_model_name_or_path)
self.model.eval()
self.bf16 = bf16
if self.bf16:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
if device == "auto":
device = get_device_type()
self.device = device
self.original_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
speecht5_model_name_or_path = os.environ.get("SPEECHT5_MODEL_PATH", "microsoft/speecht5_tts")
self.original_model = SpeechT5ForTextToSpeech.from_pretrained(speecht5_model_name_or_path).to(self.device)
self.processor = SpeechT5Processor.from_pretrained(speecht5_model_name_or_path)
self.voice = voice
self.output_audio_path = output_audio_path
self.stream_mode = stream_mode
Expand All @@ -66,7 +67,8 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
except Exception as e: # pragma: no cover
logging.warning("[TTS Warning] speaker model fail to load, so speaker embedding creating is disabled.")
self.speaker_model = None
self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
hifigan_model_name_or_path = os.environ.get("SPEECHT5_HIFIGAN_MODEL_PATH", "microsoft/speecht5_hifigan")
self.vocoder = SpeechT5HifiGan.from_pretrained(hifigan_model_name_or_path).to(self.device)
self.vocoder.eval()
script_dir = os.path.dirname(os.path.abspath(__file__))
self.default_speaker_embedding = None
Expand Down

0 comments on commit db7da09

Please sign in to comment.