Skip to content

Commit

Permalink
OO style inference for the synthesizer, experimental support for low …
Browse files Browse the repository at this point in the history
…memory GPUs
  • Loading branch information
CorentinJ committed Jun 25, 2019
1 parent b9a072e commit 8229b1d
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 97 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ SV2TTS is a three-stage deep learning framework that allows to create a numerica
|[1712.05884](https://arxiv.org/pdf/1712.05884.pdf) | Tacotron 2 (synthesizer) | Natural TTS Synthesis by Conditioning Wavenet on Mel Spectrogram Predictions | [Rayhane-mamah/Tacotron-2](https://github.com/Rayhane-mamah/Tacotron-2)
|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |

## News
**25/06/19:** Experimental support for low-memory GPUs (~2 GB) was added for the synthesizer. Pass `--low_mem` to `demo_cli.py` or `demo_toolbox.py` to enable it. It adds significant overhead, so it is not recommended if you have enough VRAM.


## Quick start
Expand Down
13 changes: 7 additions & 6 deletions demo_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from synthesizer import inference as synthesizer
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder import inference as vocoder
from pathlib import Path
Expand Down Expand Up @@ -28,6 +28,9 @@
parser.add_argument("-v", "--voc_model_fpath", type=Path,
default="vocoder/saved_models/pretrained/pretrained.pt",
help="Path to a saved vocoder")
parser.add_argument("--low_mem", action="store_true", help=\
"If True, the memory used by the synthesizer will be freed after each use. Adds large "
"overhead but allows to save some GPU memory for lower-end GPUs.")
parser.add_argument("--no_sound", action="store_true", help=\
"If True, audio won't be played.")
args = parser.parse_args()
Expand Down Expand Up @@ -56,12 +59,10 @@


## Load the models one by one.
print("Loading the encoder, the synthesizer and the vocoder. This should take a few seconds. "
"The synthesizer will output a lot of stuff. Tensorflow is like that.")
print("Preparing the encoder, the synthesizer and the vocoder...")
encoder.load_model(args.enc_model_fpath)
synthesizer.load_model(args.syn_model_dir.joinpath("taco_pretrained"))
synthesizer = Synthesizer(args.syn_model_dir.joinpath("taco_pretrained"), low_mem=args.low_mem)
vocoder.load_model(args.voc_model_fpath)
print("\nAll models succesfully loaded!\n")


## Run a test
Expand All @@ -87,7 +88,7 @@
# illustrate that
embeds = [embed, np.zeros(speaker_embedding_size)]
texts = ["test 1", "test 2"]
print("\tTesting the synthesizer...")
print("\tTesting the synthesizer... (loading the model will output a lot of text)")
mels = synthesizer.synthesize_spectrograms(texts, embeds)

# The vocoder synthesizes one waveform at a time, but it's more efficient for long ones. We
Expand Down
3 changes: 3 additions & 0 deletions demo_toolbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
help="Directory containing saved synthesizer models")
parser.add_argument("-v", "--voc_models_dir", type=Path, default="vocoder/saved_models",
help="Directory containing saved vocoder models")
parser.add_argument("--low_mem", action="store_true", help=\
"If True, the memory used by the synthesizer will be freed after each use. Adds large "
"overhead but allows to save some GPU memory for lower-end GPUs.")
args = parser.parse_args()

# Launch the toolbox
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ Unidecode
inflect
PyQt5
multiprocess
numba
172 changes: 120 additions & 52 deletions synthesizer/inference.py
Original file line number Diff line number Diff line change
@@ -1,69 +1,137 @@
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams
from synthesizer.synthesizer import Synthesizer
from multiprocess.pool import Pool # You're free to use either one
#from multiprocessing import Pool #
from synthesizer import audio
from pathlib import Path
from typing import Union, List
import tensorflow as tf
import numpy as np
import numba.cuda
import librosa

_model = None # type: Synthesizer
sample_rate = hparams.sample_rate

# TODO: allow for custom hparams throughout this module?

def load_model(checkpoints_dir: Path):
global _model
class Synthesizer:
sample_rate = hparams.sample_rate
hparams = hparams

tf.reset_default_graph()
_model = Synthesizer()
checkpoint_fpath = tf.train.get_checkpoint_state(checkpoints_dir).model_checkpoint_path
_model.load(checkpoint_fpath, hparams)
def __init__(self, checkpoints_dir: Union[str, Path], verbose=True, low_mem=False):
    """
    Creates a synthesizer ready for inference. The actual model isn't loaded in memory until
    needed or until load() is called.

    :param checkpoints_dir: path (str or Path) to the directory containing the checkpoint file
    as well as the weight files (.data, .index and .meta files)
    :param verbose: if False, only tensorflow's output will be printed TODO: suppress them too
    :param low_mem: if True, the model will be loaded in a separate process and its resources
    will be released after each usage. Adds a large overhead, only recommended if your GPU
    memory is low (<= 2gb)
    :raises Exception: if no checkpoint state can be found under checkpoints_dir
    """
    # Accept plain strings as well: the verbose branch below relies on Path attributes
    # (.parent.name), which would fail on a str.
    checkpoints_dir = Path(checkpoints_dir)

    self.verbose = verbose
    self._low_mem = low_mem

    # Prepare the model. Only the checkpoint path is resolved here; the model itself is
    # built later (see the docstring above).
    self._model = None  # type: Tacotron2
    checkpoint_state = tf.train.get_checkpoint_state(str(checkpoints_dir))
    if checkpoint_state is None:
        raise Exception("Could not find any synthesizer weights under %s" % checkpoints_dir)
    self.checkpoint_fpath = checkpoint_state.model_checkpoint_path

    if verbose:
        # The model name is taken from the parent directory, stripped of its "logs-" part,
        # and the step from whatever follows the last '-' in the checkpoint path.
        model_name = checkpoints_dir.parent.name.replace("logs-", "")
        step = int(self.checkpoint_fpath[self.checkpoint_fpath.rfind('-') + 1:])
        print("Found synthesizer \"%s\" trained to step %d" % (model_name, step))

def is_loaded(self):
    """
    Tells whether this synthesizer currently holds a model, i.e. whether self._model has
    been set to something other than None.
    """
    model_missing = self._model is None
    return not model_missing

model_name = checkpoints_dir.parent.name.replace("logs-", "")
step = int(checkpoint_fpath[checkpoint_fpath.rfind('-') + 1:])
print("Loaded synthesizer \"%s\" trained to step %d" % (model_name, step))

def is_loaded():
    """
    Tells whether the module-level model (_model) has been set to something other than None.
    """
    model_missing = _model is None
    return not model_missing
def load(self):
    """
    Builds the Tacotron2 model from the checkpoint path resolved in the constructor and keeps
    it on this instance. The default tensorflow graph is reset first.

    :raises Exception: if this synthesizer was created in low memory mode
    """
    if not self._low_mem:
        tf.reset_default_graph()
        self._model = Tacotron2(self.checkpoint_fpath, hparams)
        return

    # Low memory mode never keeps a model around, so a permanent load is refused
    raise Exception("Cannot load the synthesizer permanently in low mem mode")

def synthesize_spectrograms(self, texts: List[str],
embeddings: Union[np.ndarray, List[np.ndarray]],
return_alignments=False):
"""
Synthesizes mel spectrograms from texts and speaker embeddings.
def synthesize_spectrograms(texts: List[str], embeddings: np.ndarray, return_alignments=False):
"""
Synthesizes mel spectrograms from texts and speaker embeddings.
:param texts: a list of N text prompts to be synthesized
:param embeddings: a numpy array of (N, 256) speaker embeddings
:param return_alignments: if True, a matrix representing the alignments between the characters
and each decoder output step will be returned for each spectrogram
:return: a list of N melspectrograms as numpy arrays of shape (80, M), and possibly the
alignments.
"""
if not is_loaded():
raise Exception("Load a model first")

specs, alignments = _model.my_synthesize(embeddings, texts)
:param texts: a list of N text prompts to be synthesized
:param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
:param return_alignments: if True, a matrix representing the alignments between the
characters
and each decoder output step will be returned for each spectrogram
:return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
sequence length of spectrogram i, and possibly the alignments.
"""
if not self._low_mem:
# Usual inference mode: load the model on the first request and keep it loaded.
if not self.is_loaded():
self.load()
specs, alignments = self._model.my_synthesize(embeddings, texts)
else:
# Low memory inference mode: load the model upon every request. The model has to be
# loaded in a separate process to be able to release GPU memory (a simple workaround
# to tensorflow's intricacies)
specs, alignments = Pool(1).starmap(Synthesizer._one_shot_synthesize_spectrograms,
[(self.checkpoint_fpath, embeddings, texts)])[0]

if return_alignments:
return (specs, alignments) if return_alignments else specs

@staticmethod
def _one_shot_synthesize_spectrograms(checkpoint_fpath, embeddings, texts):
    """
    Builds a fresh Tacotron2 model, synthesizes once and releases the model's resources.
    Used by the low memory inference mode, which runs this function in a separate process.

    :param checkpoint_fpath: checkpoint path handed to the Tacotron2 constructor
    :param embeddings: speaker embeddings, forwarded as is to Tacotron2.my_synthesize
    :param texts: text prompts, forwarded as is to Tacotron2.my_synthesize
    :return: a tuple (specs, alignments) holding copies of the model outputs
    """
    # Load the model and forward the inputs
    tf.reset_default_graph()
    model = Tacotron2(checkpoint_fpath, hparams)
    try:
        specs, alignments = model.my_synthesize(embeddings, texts)

        # Detach the outputs (not doing so will cause the process to hang)
        specs, alignments = [spec.copy() for spec in specs], alignments.copy()
    finally:
        # Close cuda for this process. Done in a finally clause so that the session and the
        # device are released even when the synthesis itself raises.
        model.session.close()
        numba.cuda.select_device(0)
        numba.cuda.close()

    return specs, alignments
else:
return specs

def load_preprocess_wav(fpath):
    """
    Loads an audio file at hparams.sample_rate and, when hparams.rescale is set, rescales it
    so that its peak amplitude equals hparams.rescaling_max.

    :param fpath: path to the audio file, handed to librosa.load
    :return: the waveform returned by librosa.load, possibly rescaled
    """
    # sr is passed by keyword: recent librosa releases no longer accept it positionally
    wav = librosa.load(fpath, sr=hparams.sample_rate)[0]
    if hparams.rescale and wav.size > 0:
        peak = np.abs(wav).max()
        # A silent clip has a peak of 0; dividing by it would fill the waveform with NaNs
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max
    return wav
@staticmethod
def load_preprocess_wav(fpath):
    """
    Loads and preprocesses an audio file under the same conditions the audio files were used to
    train the synthesizer: the file is loaded at hparams.sample_rate and, when hparams.rescale
    is set, rescaled so that its peak amplitude equals hparams.rescaling_max.

    :param fpath: path to the audio file, handed to librosa.load
    :return: the waveform returned by librosa.load, possibly rescaled
    """
    # sr is passed by keyword: recent librosa releases no longer accept it positionally
    wav = librosa.load(fpath, sr=hparams.sample_rate)[0]
    if hparams.rescale and wav.size > 0:
        peak = np.abs(wav).max()
        # A silent clip has a peak of 0; dividing by it would fill the waveform with NaNs
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max
    return wav

def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
wav = load_preprocess_wav(fpath_or_wav)
else:
wav = fpath_or_wav
@staticmethod
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
    """
    Computes a float32 mel spectrogram with audio.melspectrogram and the module's hparams, in
    the same manner as the mel spectrograms that were fed to the synthesizer when training.

    :param fpath_or_wav: either a path (str or Path) to an audio file, which is then loaded
    through Synthesizer.load_preprocess_wav, or anything else, which is used directly as the
    waveform
    :return: the mel spectrogram cast to np.float32
    """
    given_a_path = isinstance(fpath_or_wav, (str, Path))
    wav = Synthesizer.load_preprocess_wav(fpath_or_wav) if given_a_path else fpath_or_wav
    return audio.melspectrogram(wav, hparams).astype(np.float32)

mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
return mel_spectrogram

def griffin_lim(mel):
    """
    Inverts a mel spectrogram by calling audio.inv_mel_spectrogram with the module's hparams.

    :param mel: the mel spectrogram to invert
    :return: whatever audio.inv_mel_spectrogram returns for it
    """
    inverted = audio.inv_mel_spectrogram(mel, hparams)
    return inverted


@staticmethod
def griffin_lim(mel):
    """
    Inverts a mel spectrogram using Griffin-Lim, through audio.inv_mel_spectrogram. The mel
    spectrogram is expected to have been built with the same parameters present in hparams.py.

    :param mel: the mel spectrogram to invert
    :return: whatever audio.inv_mel_spectrogram returns for it
    """
    inverted = audio.inv_mel_spectrogram(mel, hparams)
    return inverted

8 changes: 3 additions & 5 deletions synthesizer/synthesize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from synthesizer.synthesizer import Synthesizer
from synthesizer.tacotron2 import Tacotron2
from synthesizer.hparams import hparams_debug_string
from synthesizer.infolog import log
import tensorflow as tf
Expand All @@ -18,8 +18,7 @@ def run_eval(args, checkpoint_path, output_dir, hparams, sentences):
os.makedirs(os.path.join(log_dir, "plots"), exist_ok=True)

log(hparams_debug_string())
synth = Synthesizer()
synth.load(checkpoint_path, hparams)
synth = Tacotron2(checkpoint_path, hparams)

#Set inputs batch wise
sentences = [sentences[i: i+hparams.tacotron_synthesis_batch_size] for i
Expand All @@ -44,10 +43,9 @@ def run_synthesis(in_dir, out_dir, model_dir, hparams):
print(hparams_debug_string())

# Load the model in memory
synth = Synthesizer()
weights_dir = os.path.join(model_dir, "taco_pretrained")
checkpoint_fpath = tf.train.get_checkpoint_state(weights_dir).model_checkpoint_path
synth.load(checkpoint_fpath, hparams, gta=True)
synth = Tacotron2(checkpoint_fpath, hparams, gta=True)

# Load the metadata
with open(metadata_filename, encoding="utf-8") as f:
Expand Down
9 changes: 4 additions & 5 deletions synthesizer/synthesizer.py → synthesizer/tacotron2.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
from synthesizer.utils.text import text_to_sequence
from synthesizer import audio
from synthesizer.infolog import log
from synthesizer.models import create_model
from synthesizer.utils import plot
from synthesizer import audio
import tensorflow as tf
import numpy as np
import wave
import os


class Synthesizer:
def load(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
class Tacotron2:
def __init__(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):
log("Constructing model: %s" % model_name)
#Force the batch size to be known in order to use attention masking in batch synthesis
inputs = tf.placeholder(tf.int32, (None, None), name="inputs")
Expand Down Expand Up @@ -65,7 +64,7 @@ def load(self, checkpoint_path, hparams, gta=False, model_name="Tacotron"):

def my_synthesize(self, speaker_embeds, texts):
"""
Lighter synthesis function that directly returns the mel spectrogram.
Lighter synthesis function that directly returns the mel spectrograms.
"""

# Prepare the input
Expand Down
Loading

0 comments on commit 8229b1d

Please sign in to comment.