From be09c5a13c190fbc03f5dda92b982b33ab26ae5a Mon Sep 17 00:00:00 2001
From: Rayhane Mama <34689728+Rayhane-mamah@users.noreply.github.com>
Date: Tue, 6 Mar 2018 22:46:34 +0100
Subject: [PATCH] audio params update

---
 tacotron/hparams.py     | 16 +++++++++-------
 tacotron/utils/audio.py | 25 ++++++++++++++++---------
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/tacotron/hparams.py b/tacotron/hparams.py
index 7aa4c00d..6dcb9bbc 100644
--- a/tacotron/hparams.py
+++ b/tacotron/hparams.py
@@ -9,15 +9,17 @@
 	cleaners='english_cleaners',
 
 	#Audio
-	num_mels=80,
+	num_mels=80,
 	num_freq=1025,
-	sample_rate=24000,
-	frame_length_ms=50,
-	frame_shift_ms=12.5,
+	sample_rate=22050, #22050 Hz (corresponding to ljspeech dataset)
+	frame_length_ms= 50,
+	frame_shift_ms= 12.5,
+	lfilter=True, #whether to use preemphasis
 	preemphasis=0.97,
 	min_level_db=-100,
 	ref_level_db=20,
-	cmu_dict=False,
+	fmin=125,
+	fmax=7600,
 
 	#Model
 	outputs_per_step = 1, #number of frames to generate at each decoding step
@@ -33,10 +35,10 @@
 	postnet_num_layers=5, #number of postnet convolutional layers
 	postnet_kernel_size=(5, ), #size of postnet convolution filters for each layer
 	postnet_channels=512, #number of postnet convolution filters for each layer
-	max_iters=808, #Max decoder steps during inference (feel free to change it)
+	max_iters=810, #Max decoder steps during inference (feel free to change it)
 
 	#Training
-	batch_size = 16, #number of training samples on each training steps
+	batch_size = 32, #number of training samples on each training steps
 	reg_weight = 10e-6, #regularization weight (for l2 regularization)
 	decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
 	decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope)
diff --git a/tacotron/utils/audio.py b/tacotron/utils/audio.py
index 0839f0ac..52309a71 100644
--- a/tacotron/utils/audio.py
+++ b/tacotron/utils/audio.py
@@ -20,12 +20,18 @@ def inv_preemphasis(x):
 	return signal.lfilter([1], [1, -hparams.preemphasis], x)
 
 def spectrogram(y):
-	D = _stft(preemphasis(y))
+	if hparams.lfilter:
+		D = _stft(preemphasis(y))
+	else:
+		D = _stft(y)
 	S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
 	return _normalize(S)
 
 def melspectrogram(y):
-	D = _stft(preemphasis(y))
+	if hparams.lfilter:
+		D = _stft(preemphasis(y))
+	else:
+		D = _stft(y)
 	S = _amp_to_db(_linear_to_mel(np.abs(D)))
 	return _normalize(S)
 
@@ -39,18 +45,18 @@ def find_endpoint(wav, threshhold_db=-40, min_silence_sec=0.8):
 	return len(wav)
 
 def _stft(y):
-	n_fft, hop_length, win_lenght = _stft_params()
-	return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_lenght)
+	n_fft, hop_length, win_length = _stft_params()
+	return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
 
 def istft(y):
-	_, hop_length, win_lenght = _stft_params()
-	return librosa.istft(y=y, hop_length=hop_length, win_lenght=win_lenght)
+	_, hop_length, win_length = _stft_params()
+	return librosa.istft(y=y, hop_length=hop_length, win_length=win_length)
 
 def _stft_params():
 	n_fft = (hparams.num_freq - 1) * 2
 	hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
-	win_lenght = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
-	return n_fft, hop_length, win_lenght
+	win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
+	return n_fft, hop_length, win_length
 
 
 # Conversions
@@ -64,7 +70,8 @@ def _linear_to_mel(spectogram):
 
 def _build_mel_basis():
 	n_fft = (hparams.num_freq - 1) * 2
-	return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
+	return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels,
+		fmin=hparams.fmin, fmax=hparams.fmax,)
 
 def _amp_to_db(x):
 	return 20 * np.log10(np.maximum(1e-5, x)) + 0.01
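For reference, a minimal sketch (illustrative only, not part of the patch) of the analysis settings these hparams imply, assuming librosa is available; the variable names below are hypothetical:

    # Derived STFT/mel settings for sample_rate=22050, frame_shift_ms=12.5,
    # frame_length_ms=50, num_freq=1025, num_mels=80, fmin=125, fmax=7600
    import librosa

    sample_rate = 22050
    n_fft = (1025 - 1) * 2                       # 2048-point FFT
    hop_length = int(12.5 / 1000 * sample_rate)  # 275-sample frame shift (~12.5 ms)
    win_length = int(50 / 1000 * sample_rate)    # 1102-sample analysis window (~50 ms)

    # Mel filterbank restricted to 125-7600 Hz, as in the updated _build_mel_basis()
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=80,
                                    fmin=125, fmax=7600)
    print(mel_basis.shape)                       # (80, 1025)

Note that fmax=7600 Hz stays below the 11025 Hz Nyquist frequency of 22.05 kHz audio.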