diff --git a/tacotron/datasets/feeder.py b/tacotron/datasets/feeder.py index add1c1a4..4d4561f8 100644 --- a/tacotron/datasets/feeder.py +++ b/tacotron/datasets/feeder.py @@ -100,7 +100,6 @@ def _prepare_batch(batch, outputs_per_step): inputs = _prepare_inputs([x[0] for x in batch]) input_lengths = np.asarray([len(x[0]) for x in batch], dtype=np.int32) mel_targets = _prepare_targets([x[1] for x in batch], outputs_per_step) - #linear_targets = _prepare_targets([x[2] for x in batch], outputs_per_step) return (inputs, input_lengths, mel_targets) def _prepare_inputs(inputs): diff --git a/tacotron/datasets/preprocessor.py b/tacotron/datasets/preprocessor.py index d7478b92..a52d3081 100644 --- a/tacotron/datasets/preprocessor.py +++ b/tacotron/datasets/preprocessor.py @@ -54,12 +54,9 @@ def _process_utterance(out_dir, index, wav_path, text): # Load the audio as numpy array wav = audio.load_wav(wav_path) - # Compute the linear-scale spectrogram from the wav to calculate n_frames - spectrogram = audio.spectrogram(wav).astype(np.float32) - n_frames = spectrogram.shape[1] - # Compute the mel scale spectrogram from the wav mel_spectrogram = audio.melspectrogram(wav).astype(np.float32) + n_frames = mel_spectrogram.shape[1] # Write the spectrogram to disk mel_filename = 'ljspeech-mel-{:05d}.npy'.format(index) diff --git a/tacotron/griffin_lim_synthesis_example.ipynb b/tacotron/griffin_lim_synthesis_example.ipynb new file mode 100644 index 00000000..1a0526ee --- /dev/null +++ b/tacotron/griffin_lim_synthesis_example.ipynb @@ -0,0 +1,68 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(660, 80)" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "from utils.audio import *\n", + "import os\n", + "\n", + "mel_folder = 'logs-Tacotron'\n", + "mel_file = 'ljspeech-mel-prediction-step-1400.npy'\n", + "\n", + "mel_file = os.path.join(mel_folder, mel_file) \n", + "mel_spectro = np.load(mel_file)\n", + "mel_spectro.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "wav = inv_mel_spectrogram(mel_spectro.T)\n", + "save_wav(wav, 'wav_out/test.wav')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tacotron/hparams.py b/tacotron/hparams.py index 6dcb9bbc..42f4c471 100644 --- a/tacotron/hparams.py +++ b/tacotron/hparams.py @@ -20,9 +20,11 @@ ref_level_db=20, fmin=125, fmax=7600, + power=1.3, + griffin_lim_iters=60, #Model - outputs_per_step = 1, #number of frames to generate at each decoding step + outputs_per_step = 5, #number of frames to generate at each decoding step (speeds up computation and allows for higher batch size) embedding_dim = 512, #dimension of embedding space enc_conv_num_layers=3, #number of encoder convolutional layers enc_conv_kernel_size=(5, ), #size of encoder convolution filters for each layer @@ -35,19 +37,19 @@ postnet_num_layers=5, #number of postnet convolutional layers postnet_kernel_size=(5, ), #size of postnet convolution filters for each layer postnet_channels=512, #number of postnet convolution filters for each layer - max_iters=810, #Max decoder steps during inference (feel free to change it) + max_iters=175, #Max decoder steps during inference (feel free to change it) #Training batch_size = 32, #number of training samples on each training steps - reg_weight = 10e-6, #regularization weight (for l2 regularization) + reg_weight = 10**(-6), #regularization weight (for l2 regularization) decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope) - decay_rate = 0.97, #learning rate decay rate - initial_learning_rate = 10e-3, #starting learning rate - final_learning_rate = 10e-5, #minimal learning rate + decay_rate = 0.4, #learning rate decay rate + initial_learning_rate = 10**(-3), #starting learning rate + final_learning_rate = 10**(-5), #minimal learning rate adam_beta1 = 0.9, #AdamOptimizer beta1 parameter adam_beta2 = 0.999, #AdamOptimizer beta2 parameter - adam_epsilon = 10e-6, #AdamOptimizer beta3 parameter + adam_epsilon = 10**(-6), #AdamOptimizer beta3 parameter zoneout_rate=0.1, #zoneout rate for all LSTM cells in the network dropout_rate=0.5, #dropout rate for all convolutional layers + prenet diff --git a/tacotron/models/attention.py b/tacotron/models/attention.py index 18f51407..abc8c706 100644 --- a/tacotron/models/attention.py +++ b/tacotron/models/attention.py @@ -1,7 +1,7 @@ """Attention file for location based attention (compatible with tensorflow attention wrapper)""" import tensorflow as tf -from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseMonotonicAttentionMechanism +from tensorflow.contrib.seq2seq.python.ops.attention_wrapper import _BaseAttentionMechanism from tensorflow.python.ops import nn_ops from tensorflow.python.layers import core as layers_core from tensorflow.python.ops import variable_scope @@ -37,6 +37,7 @@ def _location_sensitive_score(W_query, attention_weights, W_keys): # location features [batch_size, max_time, filters] f = tf.layers.conv1d(attention_weights, filters=32, kernel_size=(31, ), padding='same', + kernel_initializer=tf.contrib.layers.xavier_initializer(), name='location_features') # Projected location features [batch_size, max_time, attention_dim] @@ -49,12 +50,13 @@ def _location_sensitive_score(W_query, attention_weights, W_keys): scope='W_filter') v_a = tf.get_variable( - 'v_a', shape=[num_units], dtype=tf.float32) + 'v_a', shape=[num_units], dtype=tf.float32, + initializer=tf.contrib.layers.xavier_initializer()) return tf.reduce_sum(v_a * tf.tanh(W_keys + tf.expand_dims(W_query, axis=1) + W_fil), axis=2) -class LocationSensitiveAttention(_BaseMonotonicAttentionMechanism): +class LocationSensitiveAttention(_BaseAttentionMechanism): """Impelements Bahdanau-style (cumulative) scoring function. Usually referred to as "hybrid" attention (content-based + location-based) This attention is described in: diff --git a/tacotron/models/dynamic_decoder.py b/tacotron/models/dynamic_decoder.py index 25d9c74d..06646fd5 100644 --- a/tacotron/models/dynamic_decoder.py +++ b/tacotron/models/dynamic_decoder.py @@ -224,7 +224,7 @@ def _maybe_copy_state(new, cur): stop_token_loss = res[5] #Average error over decoding steps - avg_stop_loss = stop_token_loss / steps + #avg_stop_loss = stop_token_loss / steps final_outputs = nest.map_structure( lambda ta: ta.stack(), final_outputs_ta) @@ -232,4 +232,4 @@ def _maybe_copy_state(new, cur): final_outputs = nest.map_structure( _transpose_batch_time, final_outputs) - return final_outputs, final_state, avg_stop_loss \ No newline at end of file + return final_outputs, final_state, stop_token_loss \ No newline at end of file diff --git a/tacotron/models/modules.py b/tacotron/models/modules.py index 7487916d..31ce28c0 100644 --- a/tacotron/models/modules.py +++ b/tacotron/models/modules.py @@ -13,7 +13,8 @@ def conv1d(inputs, kernel_size, channels, activation, is_training, scope): filters=channels, kernel_size=kernel_size, activation=activation, - padding='same') + padding='same', + kernel_initializer=tf.contrib.layers.xavier_initializer()) batched = tf.layers.batch_normalization(conv1d_output, training=is_training) return tf.layers.dropout(batched, rate=drop_rate, training=is_training, name='dropout_{}'.format(scope)) @@ -91,7 +92,9 @@ def prenet(inputs, is_training, layer_sizes=[128, 128], scope=None): with tf.variable_scope(scope): for i, size in enumerate(layer_sizes): - dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, name='dense_{}'.format(i + 1)) + dense = tf.layers.dense(x, units=size, activation=tf.nn.relu, + kernel_initializer=tf.contrib.layers.xavier_initializer(), + name='dense_{}'.format(i + 1)) #The paper discussed introducing diversity in generation at inference time #by using a dropout of 0.5 only in prenet layers. x = tf.layers.dropout(dense, rate=drop_rate, training=is_training, @@ -115,7 +118,7 @@ def stop_token_projection(x, shape=1, activation=lambda _: _, weights_name='stop inference time for stop token prediction """ - st_W = tf.get_variable(weights_name, shape=[x.shape[-1], 1], dtype=tf.float32, initializer=tf.truncated_normal_initializer()) + st_W = tf.get_variable(weights_name, shape=[x.shape[-1], 1], dtype=tf.float32, initializer=tf.contrib.layers.xavier_initializer()) st_b = tf.get_variable(bias_name, shape=[1], dtype=tf.float32, initializer=tf.zeros_initializer()) output = activation(tf.add(tf.matmul(x, st_W), st_b)) diff --git a/tacotron/models/rnn_wrappers.py b/tacotron/models/rnn_wrappers.py index 3722f753..c7eab6e1 100644 --- a/tacotron/models/rnn_wrappers.py +++ b/tacotron/models/rnn_wrappers.py @@ -97,35 +97,3 @@ def call(self, inputs, state): def zero_state(self, batch_size, dtype): return self._cell.zero_state(batch_size, dtype) - - -# class LinearProjectionWrapper(RNNCell): -# """Operator adding an output projection to the given cell. -# This wrapper will perform a linear transformation with specified activation function.(Default to None) -# """ -# def __init__(self, cell, projection_dim, activation=None): -# super(LinearProjectionWrapper, self).__init__() -# self._cell = cell -# self._projection_dim = projection_dim -# self._activation = activation - -# @property -# def state_size(self): -# return self._cell.state_size - -# @property -# def output_size(self): -# return self._projection_dim - -# def zero_state(self, batch_size, dtype): -# with ops.name_scope(type(self).__name__ + "ZeroState", values=[batch_size]): -# return self._cell.zero_state(batch_size, dtype) - -# def call(self, inputs, state): -# """Run the cell and output projection on inputs, starting from state.""" -# output, res_state = self._cell(inputs, state) -# projected = projection(output, self._projection_dim) -# if self._activation: -# projected = self._activation(projected) - -# return projected, res_state \ No newline at end of file diff --git a/tacotron/models/tacotron.py b/tacotron/models/tacotron.py index 83c3fc54..bcf8435c 100644 --- a/tacotron/models/tacotron.py +++ b/tacotron/models/tacotron.py @@ -4,7 +4,7 @@ from .helpers import TacoTrainingHelper, TacoTestHelper from .modules import * from models.zoneout_LSTM import ZoneoutLSTMCell -from tensorflow.contrib.seq2seq import AttentionWrapper +from tensorflow.contrib.seq2seq import AttentionWrapper, LuongAttention from .rnn_wrappers import * from tensorflow.contrib.rnn import MultiRNNCell, OutputProjectionWrapper from .attention import LocationSensitiveAttention @@ -53,9 +53,9 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False): #Attention attention_cell = AttentionWrapper( - DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training, - zoneout_factor_cell=hp.zoneout_rate, - zoneout_factor_output=hp.zoneout_rate), is_training), + DecoderPrenetWrapper(ZoneoutLSTMCell(hp.attention_dim, is_training, #Separate LSTM for attention mechanism + zoneout_factor_cell=hp.zoneout_rate, #based on original tacotron architecture + zoneout_factor_output=hp.zoneout_rate), is_training), LocationSensitiveAttention(hp.attention_dim, encoder_outputs), alignment_history=True, output_attention=False, @@ -72,7 +72,7 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False): #Concat LSTM output with context vector concat_decoder_cell = ConcatLSTMOutputAndAttentionWrapper(decoder_cell) - #Projection to mel-spectrogram dimension (linear transformation) + #Projection to mel-spectrogram dimension (times number of outputs per step) (linear transformation) output_cell = OutputProjectionWrapper(concat_decoder_cell, hp.num_mels * hp.outputs_per_step) #Define the helper for our decoder @@ -81,7 +81,7 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False): else: self.helper = TacoTestHelper(batch_size, hp.num_mels, hp.outputs_per_step) - #We"ll only limit decoder time steps during inference (consult hparams.py to modify the value) + #We'll only limit decoder time steps during inference (consult hparams.py to modify the value) max_iterations = None if is_training else hp.max_iters #initial decoder state @@ -90,15 +90,19 @@ def initialize(self, inputs, input_lengths, mel_targets=None, gta=False): #Decode (decoder_output, _), final_decoder_state, self.stop_token_loss = dynamic_decode( CustomDecoder(output_cell, self.helper, decoder_init_state), - impute_finished=True, #Cut out padded parts + impute_finished=True, #Cut out padded parts (enabled) maximum_iterations=max_iterations) + # Reshape outputs to be one output per entry + decoder_output = tf.reshape(decoder_output, [batch_size, -1, hp.num_mels]) + #Compute residual using post-net residual = postnet(decoder_output, is_training, kernel_size=hp.postnet_kernel_size, channels=hp.postnet_channels) #Project residual to same dimension as mel spectrogram - projected_residual = projection(residual, shape=hp.num_mels, + projected_residual = projection(residual, + shape=hp.num_mels, scope='residual_projection') #Compute the mel spectrogram @@ -136,7 +140,8 @@ def add_loss(self): # Get all trainable variables all_vars = tf.trainable_variables() # Compute the regularization term - regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars]) * hp.reg_weight + regularization = tf.add_n([tf.nn.l2_loss(v) for v in all_vars + if not('bias' in v.name or 'Bias' in v.name)]) * hp.reg_weight # Compute final loss term self.before_loss = before @@ -173,15 +178,16 @@ def add_optimizer(self, global_step): global_step=global_step) def _learning_rate_decay(self, init_lr, global_step): - # Exponential decay starting after 50,000 iterations + # Exponential decay starting after 50,000 iterations (ignored for now) # We won't drop learning rate below 10e-5 hp = self._hparams step = tf.cast(global_step + 1, dtype=tf.float32) - if tf.greater(step, self.decay_steps) == True: - lr = tf.train.exponential_decay(init_lr, - global_step - decay_steps + 1, - self.decay_steps, - self.decay_rate, - name='exponential_decay') - return max(hp.final_learning_rate, lr) - return init_lr \ No newline at end of file + #Testing decaying rate since beginning (as the model seems to train faster than expected) + #if tf.greater(step, self.decay_steps) == True: + lr = tf.train.exponential_decay(init_lr, + global_step - self.decay_steps + 1, + self.decay_steps, + self.decay_rate, + name='exponential_decay') + return tf.maximum(hp.final_learning_rate, lr) + #return init_lr \ No newline at end of file diff --git a/tacotron/models/zoneout_LSTM.py b/tacotron/models/zoneout_LSTM.py index a32d1ff3..24e41369 100644 --- a/tacotron/models/zoneout_LSTM.py +++ b/tacotron/models/zoneout_LSTM.py @@ -22,7 +22,8 @@ class ZoneoutLSTMCell(RNNCell): def __init__(self, num_units, is_training, input_size=None, use_peepholes=False, cell_clip=None, - initializer=orthogonal_initializer(), + #initializer=orthogonal_initializer(), + initializer=tf.contrib.layers.xavier_initializer(), num_proj=None, proj_clip=None, ext_proj=None, forget_bias=1.0, state_is_tuple=True, diff --git a/tacotron/train.py b/tacotron/train.py index 7abdbe31..a1fffbad 100644 --- a/tacotron/train.py +++ b/tacotron/train.py @@ -24,6 +24,10 @@ def add_stats(model): tf.summary.scalar('regularization_loss', model.regularization_loss) tf.summary.scalar('stop_token_loss', model.stop_token_loss) tf.summary.scalar('loss', model.loss) + tf.summary.scalar('learning_rate', model.learning_rate) #control learning rate decay speed + gradient_norms = [tf.norm(grad) for grad in model.gradients] + tf.summary.histogram('gradient_norm', gradient_norms) + tf.summary.scalar('max_gradient_norm', tf.reduce_max(gradient_norms)) #visualize gradients (in case of explosion) return tf.summary.merge_all() def time_string(): @@ -33,6 +37,8 @@ def train(log_dir, args): save_dir = os.path.join(log_dir, 'pretrained/') checkpoint_path = os.path.join(save_dir, 'model.ckpt') input_path = os.path.join(args.base_dir, args.input) + plot_dir = os.path.join(log_dir, 'plots') + os.makedirs(plot_dir, exist_ok=True) log('Checkpoint path: {}'.format(checkpoint_path)) log('Loading training data from: {}'.format(input_path)) log('Using model: {}'.format(args.model)) @@ -122,18 +128,25 @@ def train(log_dir, args): saver.save(sess, checkpoint_path, global_step=step) # Unlike the original tacotron, we won't save audio # because we yet have to use wavenet as vocoder - log('Saving alignement..') - input_seq, prediction, alignment = sess.run([model.inputs[0], - model.mel_outputs[0], - model.alignments[0], - ]) + log('Saving alignement and Mel-Spectrograms..') + input_seq, prediction, alignment, target = sess.run([model.inputs[0], + model.mel_outputs[0], + model.alignments[0], + model.mel_targets[0], + ]) #save predicted spectrogram to disk (for plot and manual evaluation purposes) mel_filename = 'ljspeech-mel-prediction-step-{}.npy'.format(step) np.save(os.path.join(log_dir, mel_filename), prediction, allow_pickle=False) - #save alignment plot to disk (evaluation purposes) - plot.plot_alignment(alignment, os.path.join(log_dir, 'step-{}-align.png'.format(step)), + #save alignment plot to disk (control purposes) + plot.plot_alignment(alignment, os.path.join(plot_dir, 'step-{}-align.png'.format(step)), info='{}, {}, step={}, loss={:.5f}'.format(args.model, time_string(), step, loss)) + #save real mel-spectrogram plot to disk (control purposes) + plot.plot_spectrogram(target, os.path.join(plot_dir, 'step-{}-real-mel-spectrogram.png'.format(step)), + info='{}, {}, step={}, Real'.format(args.model, time_string(), step, loss)) + #save predicted mel-spectrogram plot to disk (control purposes) + plot.plot_spectrogram(prediction, os.path.join(plot_dir, 'step-{}-pred-mel-spectrogram.png'.format(step)), + info='{}, {}, step={}, loss={:.5}'.format(args.model, time_string(), step, loss)) log('Input at step {}: {}'.format(step, sequence_to_text(input_seq))) except Exception as e: diff --git a/tacotron/utils/audio.py b/tacotron/utils/audio.py index 52309a71..d5533c5d 100644 --- a/tacotron/utils/audio.py +++ b/tacotron/utils/audio.py @@ -19,38 +19,40 @@ def preemphasis(x): def inv_preemphasis(x): return signal.lfilter([1], [1, -hparams.preemphasis], x) -def spectrogram(y): - if hparams.lfilter: - D = _stft(preemphasis(y)) - else: - D = _stft(y) - S = _amp_to_db(np.abs(D)) - hparams.ref_level_db - return _normalize(S) - def melspectrogram(y): if hparams.lfilter: D = _stft(preemphasis(y)) else: D = _stft(y) - S = _amp_to_db(_linear_to_mel(np.abs(D))) + S = _amp_to_db(_linear_to_mel(np.abs(D))) - hparams.ref_level_db return _normalize(S) -def find_endpoint(wav, threshhold_db=-40, min_silence_sec=0.8): - window_length = int(hparams.sample_rate * min_silence_sec) - hop_length = int(window_length / 4) - threshhold = _db_to_amp(threshhold_db) - for x in range(hop_length, len(wav) - window_length, hop_length): - if np.max(wav[x: x+window_length]) < threshhold: - return x + hop_length - return len(wav) +def inv_mel_spectrogram(mel_spectrogram): + '''Converts mel spectrogram to waveform using librosa''' + S = _mel_to_linear(_db_to_amp(_denormalize(mel_spectrogram) + hparams.ref_level_db)) # Convert back to linear + if hparams.lfilter: + return inv_preemphasis(_griffin_lim(S ** hparams.power)) # Reconstruct phase + return _griffin_lim(S ** hparams.power) + +def _griffin_lim(S): + '''librosa implementation of Griffin-Lim + Based on https://github.com/librosa/librosa/issues/434 + ''' + angles = np.exp(2j * np.pi * np.random.rand(*S.shape)) + S_complex = np.abs(S).astype(np.complex) + y = _istft(S_complex * angles) + for i in range(hparams.griffin_lim_iters): + angles = np.exp(1j * np.angle(_stft(y))) + y = _istft(S_complex * angles) + return y def _stft(y): n_fft, hop_length, win_length = _stft_params() return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length) -def istft(y): +def _istft(y): _, hop_length, win_length = _stft_params() - return librosa.istft(y=y, hop_length=hop_length, win_length=win_length) + return librosa.istft(y, hop_length=hop_length, win_length=win_length) def _stft_params(): n_fft = (hparams.num_freq - 1) * 2 @@ -58,8 +60,8 @@ def _stft_params(): win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate) return n_fft, hop_length, win_length -# Conversions +# Conversions _mel_basis = None def _linear_to_mel(spectogram): @@ -68,6 +70,12 @@ def _linear_to_mel(spectogram): _mel_basis = _build_mel_basis() return np.dot(_mel_basis, spectogram) +def _mel_to_linear(mel_spectrogram): + global _mel_basis + if _mel_basis is None: + _mel_basis = _build_mel_basis() + return np.dot(_mel_basis.T, mel_spectrogram) + def _build_mel_basis(): n_fft = (hparams.num_freq - 1) * 2 return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels, @@ -76,7 +84,7 @@ def _build_mel_basis(): def _amp_to_db(x): return 20 * np.log10(np.maximum(1e-5, x)) + 0.01 -def _dp_to_amp(x): +def _db_to_amp(x): return np.power(10.0, (x - 0.01) * 0.05) def _normalize(S): diff --git a/tacotron/utils/plot.py b/tacotron/utils/plot.py index 1570d124..13ea360d 100644 --- a/tacotron/utils/plot.py +++ b/tacotron/utils/plot.py @@ -1,20 +1,34 @@ import matplotlib matplotlib.use('Agg') import matplotlib.pyplot as plt +import numpy as np def plot_alignment(alignment, path, info=None): - fig, ax = plt.subplots() - im = ax.imshow( - alignment, - aspect='auto', - origin='lower', - interpolation='none') - fig.colorbar(im, ax=ax) - xlabel = 'Decoder timestep' - if info is not None: - xlabel += '\n\n' + info - plt.xlabel(xlabel) - plt.ylabel('Encoder timestep') - plt.tight_layout() - plt.savefig(path, format='png') + fig, ax = plt.subplots() + im = ax.imshow( + alignment, + aspect='auto', + origin='lower', + interpolation='none') + fig.colorbar(im, ax=ax) + xlabel = 'Decoder timestep' + if info is not None: + xlabel += '\n\n' + info + plt.xlabel(xlabel) + plt.ylabel('Encoder timestep') + plt.tight_layout() + plt.savefig(path, format='png') + + +def plot_spectrogram(spectrogram, path, info=None): + plt.figure() + plt.imshow(np.rot90(spectrogram)) + plt.colorbar(shrink=0.5, orientation='horizontal') + plt.ylabel('mels') + xlabel = 'frames' + if info is not None: + xlabel += '\n' + info + plt.xlabel(xlabel) + plt.tight_layout() + plt.savefig(path, format='png')