diff --git a/TTS/bin/synthesize.py b/TTS/bin/synthesize.py index f82931450f..eadc308463 100755 --- a/TTS/bin/synthesize.py +++ b/TTS/bin/synthesize.py @@ -3,6 +3,7 @@ import argparse import sys +import contextlib from argparse import RawTextHelpFormatter # pylint: disable=redefined-outer-name, unused-argument @@ -241,7 +242,7 @@ def main(): default=None, ) parser.add_argument( - "--play", - help="Play the generated TTS wav.", + "--pipe_out", + help="stdout the generated TTS wav file for shell pipe.", type=str2bool, nargs="?", @@ -361,179 +362,177 @@ def main(): if not any(check_args): parser.parse_args(["-h"]) - # Late-import to make things load faster - from TTS.api import TTS - from TTS.utils.manage import ModelManager - from TTS.utils.synthesizer import Synthesizer - - # load model manager - path = Path(__file__).parent / "../.models.json" - manager = ModelManager(path, progress_bar=args.progress_bar) - api = TTS() - - tts_path = None - tts_config_path = None - speakers_file_path = None - language_ids_file_path = None - vocoder_path = None - vocoder_config_path = None - encoder_path = None - encoder_config_path = None - vc_path = None - vc_config_path = None - model_dir = None - - # CASE1 #list : list pre-trained TTS models - if args.list_models: - manager.add_cs_api_models(api.list_models()) - manager.list_models() - sys.exit() - - # CASE2 #info : model info for pre-trained TTS models - if args.model_info_by_idx: - model_query = args.model_info_by_idx - manager.model_info_by_idx(model_query) - sys.exit() - - if args.model_info_by_name: - model_query_full_name = args.model_info_by_name - manager.model_info_by_full_name(model_query_full_name) - sys.exit() - - # CASE3: TTS with coqui studio models - if "coqui_studio" in args.model_name: - print(" > Using 🐸Coqui Studio model: ", args.model_name) - api = TTS(model_name=args.model_name, cs_api_model=args.cs_model) - api.tts_to_file( - text=args.text, - emotion=args.emotion, - file_path=args.out_path, - language=args.language, - speed=args.speed, - play=args.play, - ) - if 
args.play: - print(" > Played TTS wav") - print(" > Saving output to ", args.out_path) - return - - # CASE4: load pre-trained model paths - if args.model_name is not None and not args.model_path: - model_path, config_path, model_item = manager.download_model(args.model_name) - # tts model - if model_item["model_type"] == "tts_models": - tts_path = model_path - tts_config_path = config_path - if "default_vocoder" in model_item: - args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name - - # voice conversion model - if model_item["model_type"] == "voice_conversion_models": - vc_path = model_path - vc_config_path = config_path - - # tts model with multiple files to be loaded from the directory path - if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): - model_dir = model_path - tts_path = None - tts_config_path = None - args.vocoder_name = None - - # load vocoder - if args.vocoder_name is not None and not args.vocoder_path: - vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) - - # CASE5: set custom model paths - if args.model_path is not None: - tts_path = args.model_path - tts_config_path = args.config_path - speakers_file_path = args.speakers_file_path - language_ids_file_path = args.language_ids_file_path - - if args.vocoder_path is not None: - vocoder_path = args.vocoder_path - vocoder_config_path = args.vocoder_config_path - - if args.encoder_path is not None: - encoder_path = args.encoder_path - encoder_config_path = args.encoder_config_path - - device = args.device - if args.use_cuda: - device = "cuda" - - # load models - synthesizer = Synthesizer( - tts_path, - tts_config_path, - speakers_file_path, - language_ids_file_path, - vocoder_path, - vocoder_config_path, - encoder_path, - encoder_config_path, - vc_path, - vc_config_path, - model_dir, - args.voice_dir, - ).to(device) - - # query speaker ids of a multi-speaker model. 
- if args.list_speaker_idxs: - print( - " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." - ) - print(synthesizer.tts_model.speaker_manager.name_to_id) - return - - # query langauge ids of a multi-lingual model. - if args.list_language_idxs: - print( - " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." - ) - print(synthesizer.tts_model.language_manager.name_to_id) - return - - # check the arguments against a multi-speaker model. - if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): - print( - " [!] Looks like you use a multi-speaker model. Define `--speaker_idx` to " - "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." - ) - return - - # RUN THE SYNTHESIS - if args.text: - print(" > Text: {}".format(args.text)) - - # kick it - if tts_path is not None: - wav = synthesizer.tts( - args.text, - speaker_name=args.speaker_idx, - language_name=args.language_idx, - speaker_wav=args.speaker_wav, - reference_wav=args.reference_wav, - style_wav=args.capacitron_style_wav, - style_text=args.capacitron_style_text, - reference_speaker_name=args.reference_speaker_idx, - ) - elif vc_path is not None: - wav = synthesizer.voice_conversion( - source_wav=args.source_wav, - target_wav=args.target_wav, - ) - elif model_dir is not None: - wav = synthesizer.tts( - args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav - ) - - if args.play: - print(" > Playing TTS wav") - - # save the results - print(" > Saving output to {}".format(args.out_path)) - synthesizer.save_wav(wav, args.out_path, play=args.play) + pipe_out = sys.stdout if args.pipe_out else None + + with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout): + # Late-import to make things load faster + from TTS.api import TTS + from TTS.utils.manage import ModelManager + 
from TTS.utils.synthesizer import Synthesizer + + # load model manager + path = Path(__file__).parent / "../.models.json" + manager = ModelManager(path, progress_bar=args.progress_bar) + api = TTS() + + tts_path = None + tts_config_path = None + speakers_file_path = None + language_ids_file_path = None + vocoder_path = None + vocoder_config_path = None + encoder_path = None + encoder_config_path = None + vc_path = None + vc_config_path = None + model_dir = None + + # CASE1 #list : list pre-trained TTS models + if args.list_models: + manager.add_cs_api_models(api.list_models()) + manager.list_models() + sys.exit() + + # CASE2 #info : model info for pre-trained TTS models + if args.model_info_by_idx: + model_query = args.model_info_by_idx + manager.model_info_by_idx(model_query) + sys.exit() + + if args.model_info_by_name: + model_query_full_name = args.model_info_by_name + manager.model_info_by_full_name(model_query_full_name) + sys.exit() + + # CASE3: TTS with coqui studio models + if "coqui_studio" in args.model_name: + print(" > Using 🐸Coqui Studio model: ", args.model_name) + api = TTS(model_name=args.model_name, cs_api_model=args.cs_model) + api.tts_to_file( + text=args.text, + emotion=args.emotion, + file_path=args.out_path, + language=args.language, + speed=args.speed, + play=pipe_out, + ) + print(" > Saving output to ", args.out_path) + return + + # CASE4: load pre-trained model paths + if args.model_name is not None and not args.model_path: + model_path, config_path, model_item = manager.download_model(args.model_name) + # tts model + if model_item["model_type"] == "tts_models": + tts_path = model_path + tts_config_path = config_path + if "default_vocoder" in model_item: + args.vocoder_name = model_item["default_vocoder"] if args.vocoder_name is None else args.vocoder_name + + # voice conversion model + if model_item["model_type"] == "voice_conversion_models": + vc_path = model_path + vc_config_path = config_path + + # tts model with multiple files to be 
loaded from the directory path + if model_item.get("author", None) == "fairseq" or isinstance(model_item["model_url"], list): + model_dir = model_path + tts_path = None + tts_config_path = None + args.vocoder_name = None + + # load vocoder + if args.vocoder_name is not None and not args.vocoder_path: + vocoder_path, vocoder_config_path, _ = manager.download_model(args.vocoder_name) + + # CASE5: set custom model paths + if args.model_path is not None: + tts_path = args.model_path + tts_config_path = args.config_path + speakers_file_path = args.speakers_file_path + language_ids_file_path = args.language_ids_file_path + + if args.vocoder_path is not None: + vocoder_path = args.vocoder_path + vocoder_config_path = args.vocoder_config_path + + if args.encoder_path is not None: + encoder_path = args.encoder_path + encoder_config_path = args.encoder_config_path + + device = args.device + if args.use_cuda: + device = "cuda" + + # load models + synthesizer = Synthesizer( + tts_path, + tts_config_path, + speakers_file_path, + language_ids_file_path, + vocoder_path, + vocoder_config_path, + encoder_path, + encoder_config_path, + vc_path, + vc_config_path, + model_dir, + args.voice_dir, + ).to(device) + + # query speaker ids of a multi-speaker model. + if args.list_speaker_idxs: + print( + " > Available speaker ids: (Set --speaker_idx flag to one of these values to use the multi-speaker model." + ) + print(synthesizer.tts_model.speaker_manager.name_to_id) + return + + # query langauge ids of a multi-lingual model. + if args.list_language_idxs: + print( + " > Available language ids: (Set --language_idx flag to one of these values to use the multi-lingual model." + ) + print(synthesizer.tts_model.language_manager.name_to_id) + return + + # check the arguments against a multi-speaker model. + if synthesizer.tts_speakers_file and (not args.speaker_idx and not args.speaker_wav): + print( + " [!] Looks like you use a multi-speaker model. 
Define `--speaker_idx` to " + "select the target speaker. You can list the available speakers for this model by `--list_speaker_idxs`." + ) + return + + # RUN THE SYNTHESIS + if args.text: + print(" > Text: {}".format(args.text)) + + # kick it + if tts_path is not None: + wav = synthesizer.tts( + args.text, + speaker_name=args.speaker_idx, + language_name=args.language_idx, + speaker_wav=args.speaker_wav, + reference_wav=args.reference_wav, + style_wav=args.capacitron_style_wav, + style_text=args.capacitron_style_text, + reference_speaker_name=args.reference_speaker_idx, + ) + elif vc_path is not None: + wav = synthesizer.voice_conversion( + source_wav=args.source_wav, + target_wav=args.target_wav, + ) + elif model_dir is not None: + wav = synthesizer.tts( + args.text, speaker_name=args.speaker_idx, language_name=args.language_idx, speaker_wav=args.speaker_wav + ) + + # save the results + print(" > Saving output to {}".format(args.out_path)) + synthesizer.save_wav(wav, args.out_path, play=pipe_out) if __name__ == "__main__": diff --git a/TTS/utils/audio/numpy_transforms.py b/TTS/utils/audio/numpy_transforms.py index a37737feb8..aa2f4fde4c 100644 --- a/TTS/utils/audio/numpy_transforms.py +++ b/TTS/utils/audio/numpy_transforms.py @@ -6,6 +6,7 @@ import simpleaudio as sa import soundfile as sf from librosa import magphase, pyin +from io import BytesIO # For using kwargs # pylint: disable=unused-argument @@ -428,7 +429,7 @@ def load_wav(*, filename: str, sample_rate: int = None, resample: bool = False, return x -def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, play: bool = False, **kwargs) -> None: +def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, play = None, **kwargs) -> None: """Save float waveform to a file using Scipy. 
Args: @@ -441,13 +442,12 @@ def save_wav(*, wav: np.ndarray, path: str, sample_rate: int = None, play: bool wav_norm = wav_norm.astype(np.int16) if play: - play_obj = sa.play_buffer(wav_norm, 1, 2, sample_rate) - + wav_buffer = BytesIO() + scipy.io.wavfile.write(wav_buffer, sample_rate, wav_norm) + wav_buffer.seek(0) + play.buffer.write(wav_buffer.read()) scipy.io.wavfile.write(path, sample_rate, wav_norm) - if play: - play_obj.wait_done() - def mulaw_encode(*, wav: np.ndarray, mulaw_qc: int, **kwargs) -> np.ndarray: mu = 2**mulaw_qc - 1 diff --git a/TTS/utils/audio/processor.py b/TTS/utils/audio/processor.py index 32569a4e48..0dd82e16d0 100644 --- a/TTS/utils/audio/processor.py +++ b/TTS/utils/audio/processor.py @@ -6,6 +6,7 @@ import scipy.signal import simpleaudio as sa import soundfile as sf +from io import BytesIO from TTS.tts.utils.helpers import StandardScaler from TTS.utils.audio.numpy_transforms import compute_f0 @@ -694,7 +695,7 @@ def load_wav(self, filename: str, sr: int = None) -> np.ndarray: x = self.rms_volume_norm(x, self.db_level) return x - def save_wav(self, wav: np.ndarray, path: str, sr: int = None, play: bool = False) -> None: + def save_wav(self, wav: np.ndarray, path: str, sr: int = None, play = None) -> None: """Save a waveform to a file using Scipy. Args: @@ -710,13 +711,12 @@ def save_wav(self, wav: np.ndarray, path: str, sr: int = None, play: bool = Fals wav_norm = wav_norm.astype(np.int16) if play: - play_obj = sa.play_buffer(wav_norm, 1, 2, self.sample_rate) - + wav_buffer = BytesIO() + scipy.io.wavfile.write(wav_buffer, sr if sr else self.sample_rate, wav_norm) + wav_buffer.seek(0) + play.buffer.write(wav_buffer.read()) scipy.io.wavfile.write(path, sr if sr else self.sample_rate, wav_norm) - if play: - play_obj.wait_done() - def get_duration(self, filename: str) -> float: """Get the duration of a wav file using Librosa.