OO style inference for the synthesizer, experimental support for low memory GPUs
misbah4064 · Jun 25, 2019 · 8229b1d · 8229b1d
1 parent b9a072e
commit 8229b1d
Show file tree

Hide file tree

Showing 8 changed files with 169 additions and 97 deletions.
diff --git a/README.md b/README.md
@@ -18,6 +18,8 @@ SV2TTS is a three-stage deep learning framework that allows to create a numerica
 |[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 (synthesizer) | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2)
 |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
 
+## News
+**25/06/19:** Experimental support for low-memory GPUs (~2 GB) has been added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds a large overhead, so it is not recommended if you have enough VRAM.
 
 
 ## Quick start

diff --git a/demo_cli.py b/demo_cli.py
@@ -1,6 +1,6 @@
 from encoder.params_model import model_embedding_size as speaker_embedding_size
 from utils.argutils import print_args
-from synthesizer import inference as synthesizer
+from synthesizer.inference import Synthesizer
 from encoder import inference as encoder
 from vocoder import inference as vocoder
 from pathlib import Path
@@ -28,6 +28,9 @@
     parser.add_argument("-v", "--voc_model_fpath", type=Path, 
                         default="vocoder/saved_models/pretrained/pretrained.pt",
                         help="Path to a saved vocoder")
+    parser.add_argument("--low_mem", action="store_true", help=\
+        "If True, the memory used by the synthesizer will be freed after each use. Adds large "
+        "overhead but allows to save some GPU memory for lower-end GPUs.")
     parser.add_argument("--no_sound", action="store_true", help=\
         "If True, audio won't be played.")
     args = parser.parse_args()
@@ -56,12 +59,10 @@
 
 
     ## Load the models one by one.
-    print("Loading the encoder, the synthesizer and the vocoder. This should take a few seconds. "
-          "The synthesizer will output a lot of stuff. Tensorflow is like that.")
+    print("Preparing the encoder, the synthesizer and the vocoder...")
     encoder.load_model(args.enc_model_fpath)
-    synthesizer.load_model(args.syn_model_dir.joinpath("taco_pretrained"))
+    synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
     vocoder.load_model(args.voc_model_fpath)
-    print("\nAll models succesfully loaded!\n")
 
 
     ## Run a test
@@ -87,7 +88,7 @@
     # illustrate that
     embeds = [embed, np.zeros(speaker_embedding_size)]
     texts = ["test 1", "test 2"]
-    print("\tTesting the synthesizer...")
+    print("\tTesting the synthesizer... (loading the model will output a lot of text)")
     mels = synthesizer.synthesize_spectrograms(texts, embeds)
 
     # The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We 

diff --git a/demo_toolbox.py b/demo_toolbox.py
@@ -22,6 +22,9 @@
                         help="Directory containing saved synthesizer models")
     parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models", 
                         help="Directory containing saved vocoder models")
+    parser.add_argument("--low_mem", action="store_true", help=\
+        "If True, the memory used by the synthesizer will be freed after each use. Adds large "
+        "overhead but allows to save some GPU memory for lower-end GPUs.")
     args = parser.parse_args()
 
     # Launch the toolbox

diff --git a/requirements.txt b/requirements.txt
@@ -12,3 +12,4 @@ Unidecode
 inflect
 PyQt5
 multiprocess
+numba
diff --git a/synthesizer/inference.py b/synthesizer/inference.py
@@ -1,69 +1,137 @@
+from synthesizer.tacotron2 import Tacotron2
 from synthesizer.hparams import hparams
-from synthesizer.synthesizer import Synthesizer
+from multiprocess.pool import Pool  # You're free to use either one
+#from multiprocessing import Pool   # 
 from synthesizer import audio
 from pathlib import Path
 from typing import Union, List
 import tensorflow as tf
 import numpy as np
+import numba.cuda
 import librosa
 
-_model = None   # type: Synthesizer
-sample_rate = hparams.sample_rate
 
-# TODO: allow for custom hparams throughout this module?
-
-def load_model(checkpoints_dir: Path):
-    global _model
+class Synthesizer:
+    sample_rate = hparams.sample_rate
+    hparams = hparams
 
-    tf.reset_default_graph()
-    _model = Synthesizer()
-    checkpoint_fpath = tf.train.get_checkpoint_state(checkpoints_dir).model_checkpoint_path
-    _model.load(checkpoint_fpath, hparams)
+    def __init__(self, checkpoints_dir: Path, verbose=True, low_mem=False):
+        """
+        Creates a synthesizer ready for inference. The actual model isn't loaded in memory until
+        needed or until load() is called.
+        
+        :param checkpoints_dir: path to the directory containing the checkpoint file as well as the
+        weight files (.data, .index and .meta files)
+        :param verbose: if False, only tensorflow's output will be printed (TODO: suppress it too)
+        :param low_mem: if True, the model will be loaded in a separate process and its resources 
+        will be released after each usage. Adds a large overhead, only recommended if your GPU 
+        memory is low (<= 2gb)
+        """
+        self.verbose = verbose
+        self._low_mem = low_mem
+
+        # Prepare the model
+        self._model = None  # type: Tacotron2
+        checkpoint_state = tf.train.get_checkpoint_state(checkpoints_dir)
+        if checkpoint_state is None:
+            raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
+        self.checkpoint_fpath = checkpoint_state.model_checkpoint_path
+        if verbose:
+            model_name = checkpoints_dir.parent.name.replace("logs-", "")
+            step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
+            print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))
+
+    def is_loaded(self):
+        """
+        Whether the model is loaded in GPU memory.
+        """
+        return self._model is not None
 
-    model_name = checkpoints_dir.parent.name.replace("logs-", "")
-    step = int(checkpoint_fpath[checkpoint_fpath.rfind('-') + 1:])
-    print("Loaded synthesizer \"%s\" trained to step %d" % (model_name, step))
-
-def is_loaded():
-    return _model is not None
+    def load(self):
+        """
+        Actually loads the model into GPU memory, using the checkpoint that was located in the
+        constructor.
+        """
+        if self._low_mem:
+            raise Exception("Cannot load the synthesizer permanently in low mem mode")
+        tf.reset_default_graph()
+        self._model = Tacotron2(self.checkpoint_fpath, hparams)
+
+    def synthesize_spectrograms(self, texts: List[str],
+                                embeddings: Union[np.ndarray, List[np.ndarray]],
+                                return_alignments=False):
+        """
+        Synthesizes mel spectrograms from texts and speaker embeddings.
 
-def synthesize_spectrograms(texts: List[str], embeddings: np.ndarray, return_alignments=False):
-    """
-    Synthesizes mel spectrograms from texts and speaker embeddings.
-    
-    :param texts: a list of N text prompts to be synthesized
-    :param embeddings: a numpy array of (N, 256) speaker embeddings
-    :param return_alignments: if True, a matrix representing the alignments between the characters
-    and each decoder output step will be returned for each spectrogram
-    :return: a list of N melspectrograms as numpy arrays of shape (80, M), and possibly the 
-    alignments.
-    """
-    if not is_loaded():
-        raise Exception("Load a model first")
-
-    specs, alignments = _model.my_synthesize(embeddings, texts)
+        :param texts: a list of N text prompts to be synthesized
+        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256) 
+        :param return_alignments: if True, a matrix representing the alignments between the 
+        characters and each decoder output step will be returned for each spectrogram
+        (the return value then becomes a (spectrograms, alignments) tuple)
+        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the 
+        sequence length of spectrogram i, and possibly the alignments.
+        """
+        if not self._low_mem:
+            # Usual inference mode: load the model on the first request and keep it loaded.
+            if not self.is_loaded():
+                self.load()
+            specs, alignments = self._model.my_synthesize(embeddings, texts)
+        else:
+            # Low memory inference mode: load the model upon every request. The model has to be 
+            # loaded in a separate process to be able to release GPU memory (a simple workaround 
+            # to tensorflow's intricacies)
+            specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms, 
+                                                [(self.checkpoint_fpath, embeddings, texts)])[0]
 
-    if return_alignments:
+        return (specs, alignments) if return_alignments else specs
+
+    @staticmethod
+    def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
+        # Load the model and forward the inputs
+        tf.reset_default_graph()
+        model = Tacotron2(checkpoint_fpath, hparams)
+        specs, alignments = model.my_synthesize(embeddings, texts)
+
+        # Detach the outputs (not doing so will cause the process to hang)
+        specs, alignments = [spec.copy() for spec in specs], alignments.copy()
+
+        # Close cuda for this process
+        model.session.close()
+        numba.cuda.select_device(0)
+        numba.cuda.close()
+
         return specs, alignments
-    else:
-        return specs
 
-def load_preprocess_wav(fpath):
-    wav = librosa.load(fpath, hparams.sample_rate)[0]
-    if hparams.rescale:
-        wav = wav / np.abs(wav).max() * hparams.rescaling_max
-    return wav
+    @staticmethod
+    def load_preprocess_wav(fpath):
+        """
+        Loads and preprocesses an audio file in the same way as the audio files that were used to
+        train the synthesizer.
+        """
+        wav = librosa.load(fpath, hparams.sample_rate)[0]
+        if hparams.rescale:
+            wav = wav / np.abs(wav).max() * hparams.rescaling_max
+        return wav
 
-def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
-    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
-        wav = load_preprocess_wav(fpath_or_wav)
-    else: 
-        wav = fpath_or_wav
+    @staticmethod
+    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
+        """
+        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms that 
+        were fed to the synthesizer when training.
+        """
+        if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
+        else:
+            wav = fpath_or_wav
+
+        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
+        return mel_spectrogram
 
-    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
-    return mel_spectrogram
-
-def griffin_lim(mel):
-    return audio.inv_mel_spectrogram(mel, hparams)
-
-
+    @staticmethod
+    def griffin_lim(mel):
+        """
+        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have been built
+        with the same parameters present in hparams.py.
+        """
+        return audio.inv_mel_spectrogram(mel, hparams)
+
diff --git a/synthesizer/synthesize.py b/synthesizer/synthesize.py
@@ -1,4 +1,4 @@
-from synthesizer.synthesizer import Synthesizer
+from synthesizer.tacotron2 import Tacotron2
 from synthesizer.hparams import hparams_debug_string
 from synthesizer.infolog import log
 import tensorflow as tf
@@ -18,8 +18,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
     os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)
 
     log(hparams_debug_string())
-    synth = Synthesizer()
-    synth.load(checkpoint_path, hparams)
+    synth = Tacotron2(checkpoint_path, hparams)
 
     #Set inputs batch wise
     sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i 
@@ -44,10 +43,9 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams):
     print(hparams_debug_string())
 
     # Load the model in memory
-    synth = Synthesizer()
     weights_dir = os.path.join(model_dir, "taco_pretrained")
     checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path
-    synth.load(checkpoint_fpath, hparams, gta=True)
+    synth = Tacotron2(checkpoint_fpath, hparams, gta=True)
 
     # Load the metadata
     with open(metadata_filename, encoding="utf-8") as f:

diff --git a/synthesizer/synthesizer.py → synthesizer/tacotron2.py b/synthesizer/synthesizer.py → synthesizer/tacotron2.py
@@ -1,16 +1,15 @@
 from synthesizer.utils.text import text_to_sequence
-from synthesizer import audio
 from synthesizer.infolog import log
 from synthesizer.models import create_model
 from synthesizer.utils import plot
+from synthesizer import audio
 import tensorflow as tf
 import numpy as np
-import wave
 import os
 
 
-class Synthesizer:
-    def load(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
+class Tacotron2:
+    def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
         log("Constructing model: %s" % model_name)
         #Force the batch size to be known in order to use attention masking in batch synthesis
         inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
@@ -65,7 +64,7 @@ def load(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
 
     def my_synthesize(self, speaker_embeds, texts):
         """
-        Lighter synthesis function that directly returns the mel spectrogram.
+        Lighter synthesis function that directly returns the mel spectrograms.
         """
 
         # Prepare the input
-Original file line number
+Diff line change
@@ Expand Up / @@ -12,3 +12,4 @@ Unidecode @@
     inflect
     PyQt5
     multiprocess
+    numba