Skip to content

Commit

Permalink
audio params update
Browse files Browse the repository at this point in the history
  • Loading branch information
Rayhane-mamah authored Mar 6, 2018
1 parent fccb535 commit be09c5a
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 16 deletions.
16 changes: 9 additions & 7 deletions tacotron/hparams.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,17 @@
cleaners='english_cleaners',

#Audio
num_mels=80,
num_mels=80,
num_freq=1025,
sample_rate=24000,
frame_length_ms=50,
frame_shift_ms=12.5,
sample_rate=22050, #22050 Hz (corresponding to ljspeech dataset)
frame_length_ms= 50,
frame_shift_ms= 12.5,
lfilter=True, #whether to use preemphasis
preemphasis=0.97,
min_level_db=-100,
ref_level_db=20,
cmu_dict=False,
fmin=125,
fmax=7600,

#Model
outputs_per_step = 1, #number of frames to generate at each decoding step
Expand All @@ -33,10 +35,10 @@
postnet_num_layers=5, #number of postnet convolutional layers
postnet_kernel_size=(5, ), #size of postnet convolution filters for each layer
postnet_channels=512, #number of postnet convolution filters for each layer
max_iters=808, #Max decoder steps during inference (feel free to change it)
max_iters=810, #Max decoder steps during inference (feel free to change it)

#Training
batch_size = 16, #number of training samples on each training steps
batch_size = 32, #number of training samples on each training steps
reg_weight = 10e-6, #regularization weight (for l2 regularization)
decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope)
Expand Down
25 changes: 16 additions & 9 deletions tacotron/utils/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,18 @@ def inv_preemphasis(x):
return signal.lfilter([1], [1, -hparams.preemphasis], x)

def spectrogram(y):
    """Return the normalized, dB-scaled linear magnitude spectrogram of ``y``.

    When ``hparams.lfilter`` is truthy the input is passed through
    ``preemphasis`` before the STFT; otherwise the input is transformed
    unchanged. ``hparams.ref_level_db`` is subtracted from the dB values
    before they are handed to ``_normalize``.

    Args:
        y: the audio signal to analyse (presumably a 1-D waveform array --
            confirm against callers).

    Returns:
        Whatever ``_normalize`` returns for the dB-scaled magnitudes.
    """
    # Compute the STFT exactly once; pre-emphasis is optional.
    wav = preemphasis(y) if hparams.lfilter else y
    D = _stft(wav)
    S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
    return _normalize(S)

def melspectrogram(y):
    """Return the normalized, dB-scaled mel spectrogram of ``y``.

    When ``hparams.lfilter`` is truthy the input is passed through
    ``preemphasis`` before the STFT; otherwise the input is transformed
    unchanged. The STFT magnitudes are projected with ``_linear_to_mel``,
    converted with ``_amp_to_db`` and finally passed to ``_normalize``.

    Args:
        y: the audio signal to analyse (presumably a 1-D waveform array --
            confirm against callers).

    Returns:
        Whatever ``_normalize`` returns for the dB-scaled mel magnitudes.
    """
    # Compute the STFT exactly once; pre-emphasis is optional.
    wav = preemphasis(y) if hparams.lfilter else y
    D = _stft(wav)
    # NOTE(review): unlike spectrogram(), hparams.ref_level_db is not
    # subtracted here -- confirm this asymmetry is intentional.
    S = _amp_to_db(_linear_to_mel(np.abs(D)))
    return _normalize(S)

Expand All @@ -39,18 +45,18 @@ def find_endpoint(wav, threshhold_db=-40, min_silence_sec=0.8):
return len(wav)

def _stft(y):
    """Run ``librosa.stft`` on ``y`` using the sizes from ``_stft_params``.

    Args:
        y: the signal forwarded to ``librosa.stft`` as its ``y`` argument.

    Returns:
        The result of ``librosa.stft``.
    """
    n_fft, hop_length, win_length = _stft_params()
    return librosa.stft(
        y=y,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
    )

def istft(y):
    """Invert an STFT with ``librosa.istft`` using the ``_stft_params`` sizes.

    Args:
        y: the STFT matrix to invert (forwarded to ``librosa.istft``).

    Returns:
        The result of ``librosa.istft``.
    """
    # n_fft is not needed for the inverse transform, only hop and window.
    _, hop_length, win_length = _stft_params()
    # librosa.istft names its first parameter ``stft_matrix`` (not ``y``),
    # so the matrix is passed positionally to avoid a TypeError.
    return librosa.istft(y, hop_length=hop_length, win_length=win_length)

def _stft_params():
    """Derive the STFT sizes from ``hparams``.

    Returns:
        A ``(n_fft, hop_length, win_length)`` tuple where ``n_fft`` is
        ``(hparams.num_freq - 1) * 2`` and the hop and window lengths are
        ``hparams.frame_shift_ms`` and ``hparams.frame_length_ms`` converted
        from milliseconds to samples at ``hparams.sample_rate``.
    """
    n_fft = (hparams.num_freq - 1) * 2
    # int() truncates toward zero, so fractional sample counts round down.
    hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
    win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
    return n_fft, hop_length, win_length

# Conversions

Expand All @@ -64,7 +70,8 @@ def _linear_to_mel(spectogram):

def _build_mel_basis():
    """Build the mel filterbank matrix with ``librosa.filters.mel``.

    The filterbank uses ``hparams.sample_rate``, an FFT size of
    ``(hparams.num_freq - 1) * 2``, ``hparams.num_mels`` bands, and the
    frequency range ``hparams.fmin`` to ``hparams.fmax``.

    Returns:
        The result of ``librosa.filters.mel``.
    """
    n_fft = (hparams.num_freq - 1) * 2
    # Keyword arguments keep this call valid on librosa releases where
    # ``sr`` and ``n_fft`` are keyword-only.
    return librosa.filters.mel(
        sr=hparams.sample_rate,
        n_fft=n_fft,
        n_mels=hparams.num_mels,
        fmin=hparams.fmin,
        fmax=hparams.fmax,
    )

def _amp_to_db(x):
return 20 * np.log10(np.maximum(1e-5, x)) + 0.01
Expand Down

0 comments on commit be09c5a

Please sign in to comment.