From be09c5a13c190fbc03f5dda92b982b33ab26ae5a Mon Sep 17 00:00:00 2001
From: Rayhane Mama <34689728+Rayhane-mamah@users.noreply.github.com>
Date: Tue, 6 Mar 2018 22:46:34 +0100
Subject: [PATCH] audio params update

---
 tacotron/hparams.py     | 16 +++++++++-------
 tacotron/utils/audio.py | 25 ++++++++++++++++---------
 2 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/tacotron/hparams.py b/tacotron/hparams.py
index 7aa4c00d..6dcb9bbc 100644
--- a/tacotron/hparams.py
+++ b/tacotron/hparams.py
@@ -9,15 +9,17 @@
 	cleaners='english_cleaners',
 
 	#Audio
-	num_mels=80,
+	num_mels=80,
 	num_freq=1025,
-	sample_rate=24000,
-	frame_length_ms=50,
-	frame_shift_ms=12.5,
+	sample_rate=22050, #22050 Hz (corresponding to ljspeech dataset)
+	frame_length_ms= 50,
+	frame_shift_ms= 12.5,
+	lfilter=True, #whether to use preemphasis
 	preemphasis=0.97,
 	min_level_db=-100,
 	ref_level_db=20,
-	cmu_dict=False,
+	fmin=125,
+	fmax=7600,
 
 	#Model
 	outputs_per_step = 1, #number of frames to generate at each decoding step
@@ -33,10 +35,10 @@
 	postnet_num_layers=5, #number of postnet convolutional layers
 	postnet_kernel_size=(5, ), #size of postnet convolution filters for each layer
 	postnet_channels=512, #number of postnet convolution filters for each layer
-	max_iters=808, #Max decoder steps during inference (feel free to change it)
+	max_iters=810, #Max decoder steps during inference (feel free to change it)
 
 	#Training
-	batch_size = 16, #number of training samples on each training steps
+	batch_size = 32, #number of training samples on each training steps
 	reg_weight = 10e-6, #regularization weight (for l2 regularization)
 	decay_learning_rate = True, #boolean, determines if the learning rate will follow an exponential decay
 	decay_steps = 50000, #starting point for learning rate decay (and determines the decay slope)
diff --git a/tacotron/utils/audio.py b/tacotron/utils/audio.py
index 0839f0ac..52309a71 100644
--- a/tacotron/utils/audio.py
+++ b/tacotron/utils/audio.py
@@ -20,12 +20,18 @@ def inv_preemphasis(x):
 	return signal.lfilter([1], [1, -hparams.preemphasis], x)
 
 def spectrogram(y):
-	D = _stft(preemphasis(y))
+	if hparams.lfilter:
+		D = _stft(preemphasis(y))
+	else:
+		D = _stft(y)
 	S = _amp_to_db(np.abs(D)) - hparams.ref_level_db
 	return _normalize(S)
 
 def melspectrogram(y):
-	D = _stft(preemphasis(y))
+	if hparams.lfilter:
+		D = _stft(preemphasis(y))
+	else:
+		D = _stft(y)
 	S = _amp_to_db(_linear_to_mel(np.abs(D)))
 	return _normalize(S)
 
@@ -39,18 +45,18 @@ def find_endpoint(wav, threshhold_db=-40, min_silence_sec=0.8):
 	return len(wav)
 
 def _stft(y):
-	n_fft, hop_length, win_lenght = _stft_params()
-	return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_lenght)
+	n_fft, hop_length, win_length = _stft_params()
+	return librosa.stft(y=y, n_fft=n_fft, hop_length=hop_length, win_length=win_length)
 
 def istft(y):
-	_, hop_length, win_lenght = _stft_params()
-	return librosa.istft(y=y, hop_length=hop_length, win_lenght=win_lenght)
+	_, hop_length, win_length = _stft_params()
+	return librosa.istft(y=y, hop_length=hop_length, win_length=win_length)
 
 def _stft_params():
 	n_fft = (hparams.num_freq - 1) * 2
 	hop_length = int(hparams.frame_shift_ms / 1000 * hparams.sample_rate)
-	win_lenght = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
-	return n_fft, hop_length, win_lenght
+	win_length = int(hparams.frame_length_ms / 1000 * hparams.sample_rate)
+	return n_fft, hop_length, win_length
 
 
 # Conversions
@@ -64,7 +70,8 @@ def _linear_to_mel(spectogram):
 
 def _build_mel_basis():
 	n_fft = (hparams.num_freq - 1) * 2
-	return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels)
+	return librosa.filters.mel(hparams.sample_rate, n_fft, n_mels=hparams.num_mels,
+		fmin=hparams.fmin, fmax=hparams.fmax,)
 
 def _amp_to_db(x):
 	return 20 * np.log10(np.maximum(1e-5, x)) + 0.01
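For reference, a minimal sketch (illustrative only, not part of the patch) of the analysis settings these hparams imply, assuming librosa is available; the variable names below are hypothetical:

    # Derived STFT/mel settings for sample_rate=22050, frame_shift_ms=12.5,
    # frame_length_ms=50, num_freq=1025, num_mels=80, fmin=125, fmax=7600
    import librosa

    sample_rate = 22050
    n_fft = (1025 - 1) * 2                       # 2048-point FFT
    hop_length = int(12.5 / 1000 * sample_rate)  # 275-sample frame shift (~12.5 ms)
    win_length = int(50 / 1000 * sample_rate)    # 1102-sample analysis window (~50 ms)

    # Mel filterbank restricted to 125-7600 Hz, as in the updated _build_mel_basis()
    mel_basis = librosa.filters.mel(sr=sample_rate, n_fft=n_fft, n_mels=80,
                                    fmin=125, fmax=7600)
    print(mel_basis.shape)                       # (80, 1025)

Note that fmax=7600 Hz stays below the 11025 Hz Nyquist frequency of 22.05 kHz audio.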