# hparams.py (forked from jinhan/tacotron2-vae)
import tensorflow as tf  # requires TensorFlow 1.x (tf.contrib and tf.logging were removed in 2.x)


def create_hparams(hparams_string=None, verbose=False):
"""Create model hyperparameters. Parse nondefault from given string."""
hparams = tf.contrib.training.HParams(
################################
# Experiment Parameters #
################################
epochs=300,
iters_per_checkpoint=500,
seed=1234,
dynamic_loss_scaling=True,
fp16_run=False,
distributed_run=False,
dist_backend="nccl",
dist_url="tcp://localhost:54321",
cudnn_enabled=True,
cudnn_benchmark=True,
################################
# Data Parameters #
################################
load_mel_from_disk=False,
training_files='filelists/ms_kor_train.txt',
validation_files='filelists/ms_kor_val.txt',
text_cleaners=['korean_cleaners'], # english_cleaners, korean_cleaners
sort_by_length=False,
################################
# Audio Parameters #
################################
max_wav_value=32768.0,
sampling_rate=16000,
filter_length=1024,
        hop_length=256,   # number of audio samples between STFT columns, defaults to win_length // 4
        win_length=1024,  # STFT window size in samples; must satisfy win_length <= filter_length (n_fft), defaults to n_fft
n_mel_channels=80,
mel_fmin=0.0,
mel_fmax=8000.0,
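        # For reference: at sampling_rate=16000, hop_length=256 corresponds to
        # a 16 ms frame hop (62.5 frames/s), and win_length=1024 to a 64 ms
        # analysis window.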
################################
# Model Parameters #
################################
        n_symbols=80,  # set to 80 if you use korean_cleaners, 65 if you use english_cleaners
symbols_embedding_dim=512,
# Transcript encoder parameters
        encoder_kernel_size=5,
        encoder_n_convolutions=3,
        encoder_embedding_dim=512,
# Speaker embedding parameters
        n_speakers=1,
speaker_embedding_dim=16,
# ---------------------------------------- #
# emotion
        n_emotions=4,  # number of emotion labels
emotion_embedding_dim=16,
# reference encoder
        E=512,  # reference encoder embedding size
        ref_enc_filters=[32, 32, 64, 64, 128, 128],
        ref_enc_size=[3, 3],
        ref_enc_strides=[2, 2],
        ref_enc_pad=[1, 1],
        ref_enc_gru_size=512 // 2,  # = E // 2
        z_latent_dim=32,  # dimensionality of the VAE latent z
        anneal_function='logistic',  # KL-weight annealing schedule
        anneal_k=0.0025,
        anneal_x0=10000,
        anneal_upper=0.2,
        anneal_lag=50000,
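        # The anneal_* values parameterize how the weight on the VAE's KL term
        # ramps up during training; see kl_anneal_weight() below for one
        # plausible (assumed) reading of these hyperparameters.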
# Prosody embedding parameters
        prosody_n_convolutions=6,
        prosody_conv_dim_in=[1, 32, 32, 64, 64, 128],
        prosody_conv_dim_out=[32, 32, 64, 64, 128, 128],
        prosody_conv_kernel=3,
        prosody_conv_stride=2,
        prosody_embedding_dim=128,
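        # Six stride-2 convolutions downsample their input by roughly
        # 2**6 = 64x along each strided axis (exact sizes depend on padding).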
# Decoder parameters
        n_frames_per_step=1,  # currently only 1 is supported
decoder_rnn_dim=1024,
prenet_dim=256,
max_decoder_steps=1000,
gate_threshold=0.5,
p_attention_dropout=0.1,
p_decoder_dropout=0.1,
# Attention parameters
attention_rnn_dim=1024,
attention_dim=128,
# Location Layer parameters
attention_location_n_filters=32,
attention_location_kernel_size=31,
# Mel-post processing network parameters
postnet_embedding_dim=512,
postnet_kernel_size=5,
postnet_n_convolutions=5,
################################
# Optimization Hyperparameters #
################################
use_saved_learning_rate=False,
learning_rate=1e-3,
weight_decay=1e-6,
grad_clip_thresh=1.0,
batch_size=64,
        mask_padding=True  # set model's padded outputs to padded values
)
if hparams_string:
tf.logging.info('Parsing command line hparams: %s', hparams_string)
hparams.parse(hparams_string)
if verbose:
tf.logging.info('Final parsed hparams: %s', hparams.values())
return hparams
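

# A minimal sketch (not part of the original file) of how the anneal_*
# hyperparameters above might drive the weight on the VAE's KL term.
# The exact schedule lives in the repo's training code; the logistic form
# and the handling of anneal_lag here are assumptions for illustration.
def kl_anneal_weight(hparams, step):
    """Return a KL weight in [0, anneal_upper] for a given training step."""
    import math  # local import so this sketch stays self-contained
    if hparams.anneal_function == 'logistic':
        # Sigmoid ramp centered at anneal_x0 with slope anneal_k,
        # saturating at anneal_upper.
        return hparams.anneal_upper / (
            1.0 + math.exp(-hparams.anneal_k * (step - hparams.anneal_x0)))
    elif hparams.anneal_function == 'linear':
        # Assumed variant: flat at zero for anneal_lag steps, then a
        # linear ramp that reaches anneal_upper at step anneal_x0.
        if step < hparams.anneal_lag:
            return 0.0
        return min(hparams.anneal_upper,
                   hparams.anneal_upper * step / hparams.anneal_x0)
    return hparams.anneal_upper  # fallback: constant weight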


if __name__ == '__main__':
hp = create_hparams(verbose=True)
print(hp.batch_size)
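
    # HParams.parse() accepts comma-separated name=value overrides; a quick
    # demonstration of overriding two defaults from a string:
    hp_override = create_hparams('batch_size=32,learning_rate=5e-4')
    print(hp_override.batch_size, hp_override.learning_rate)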