Skip to content
This repository has been archived by the owner on Oct 25, 2024. It is now read-only.

Commit

Permalink
[NeuralChat] Fix audio plugin sample code issue and provide a way to …
Browse files Browse the repository at this point in the history
…set tts/asr model path (#1342)

* Fix audio plugin sample code issue and provide a way to set tts/asr model path

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
  • Loading branch information
lvliang-intel authored Mar 5, 2024
1 parent b494142 commit db7da09
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ pip install transformers datasets pydub
The AudioSpeechRecognition class provides functionality for converting English/Multilingual audio to text. Here's how to use it:

```python
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio import AudioSpeechRecognition
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.asr import AudioSpeechRecognition
# pass the parameter language="auto" to let the asr model automatically detect language
# otherwise, you can pass an arbitrary language to the model (e.g. en/zh/de/fr)
asr = AudioSpeechRecognition("openai/whisper-small", language="auto", device=self.device)
Expand All @@ -62,7 +62,7 @@ pip install transformers soundfile speechbrain==0.5.15
The TextToSpeech class in your module provides the capability to convert English text to speech. Here's how to use it:

```python
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio import TextToSpeech
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts import TextToSpeech
tts = TextToSpeech()
text_to_speak = "Hello, this is a sample text." # Replace with your text
output_audio_path = "./output.wav" # Replace with the desired output audio path
Expand All @@ -85,7 +85,7 @@ pip install paddlespeech paddlepaddle
The ChineseTextToSpeech class within your module provides functionality for TTS. Here's how to use it:

```python
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio import ChineseTextToSpeech
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts_chinese import ChineseTextToSpeech
# Initialize the TTS module
tts = ChineseTextToSpeech()
# Define the text you want to convert to speech
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
# limitations under the License.

import torch
import os
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from datasets import Audio, Dataset
import time
Expand All @@ -36,8 +37,9 @@ def __init__(self, model_name_or_path="openai/whisper-small", bf16=False, langua
if device == "auto":
device = get_device_type()
self.device = device
self.model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path).to(self.device)
self.processor = WhisperProcessor.from_pretrained(model_name_or_path)
asr_model_name_or_path = os.environ.get("ASR_MODEL_PATH", model_name_or_path)
self.model = WhisperForConditionalGeneration.from_pretrained(asr_model_name_or_path).to(self.device)
self.processor = WhisperProcessor.from_pretrained(asr_model_name_or_path)
self.model.eval()
self.bf16 = bf16
if self.bf16:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,9 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
if device == "auto":
device = get_device_type()
self.device = device
self.original_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(self.device)
self.processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
speecht5_model_name_or_path = os.environ.get("SPEECHT5_MODEL_PATH", "microsoft/speecht5_tts")
self.original_model = SpeechT5ForTextToSpeech.from_pretrained(speecht5_model_name_or_path).to(self.device)
self.processor = SpeechT5Processor.from_pretrained(speecht5_model_name_or_path)
self.voice = voice
self.output_audio_path = output_audio_path
self.stream_mode = stream_mode
Expand All @@ -66,7 +67,8 @@ def __init__(self, output_audio_path="./response.wav", voice="default", stream_m
except Exception as e: # pragma: no cover
logging.warning("[TTS Warning] speaker model fail to load, so speaker embedding creating is disabled.")
self.speaker_model = None
self.vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(self.device)
hifigan_model_name_or_path = os.environ.get("SPEECHT5_HIFIGAN_MODEL_PATH", "microsoft/speecht5_hifigan")
self.vocoder = SpeechT5HifiGan.from_pretrained(hifigan_model_name_or_path).to(self.device)
self.vocoder.eval()
script_dir = os.path.dirname(os.path.abspath(__file__))
self.default_speaker_embedding = None
Expand Down

0 comments on commit db7da09

Please sign in to comment.