diff --git a/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration.v1.yaml b/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration.v1.yaml index dcc33482..7f92477a 100644 --- a/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration.v1.yaml +++ b/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration.v1.yaml @@ -31,7 +31,7 @@ generator_params: num_embs: 500 kernel_size: 7 # Kernel size of initial and final conv layers. upsample_scales: [10, 8, 2, 2] # Upsampling scales. - upsample_kernal_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers. + upsample_kernel_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers. resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. resblock_dilations: # Dilations for residual blocks. - [1, 3, 5] @@ -64,7 +64,7 @@ discriminator_params: scale_discriminator_params: in_channels: 1 # Number of input channels. out_channels: 1 # Number of output channels. - kernel_sizes: [15, 41, 5, 3] # List of kernal sizes. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. @@ -78,7 +78,7 @@ discriminator_params: period_discriminator_params: in_channels: 1 # Number of input channels. out_channels: 1 # Number of output channels. - kernel_sizes: [5, 3] # List of kernal sizes. + kernel_sizes: [5, 3] # List of kernel sizes. channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. diff --git a/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration_24k.v1.yaml b/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration_24k.v1.yaml index f51d59f4..eca11c5f 100644 --- a/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration_24k.v1.yaml +++ b/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration_24k.v1.yaml @@ -32,7 +32,7 @@ generator_params: num_spk_embs: 0 # Do not consider speaker embedding for single spk kernel_size: 7 # Kernel size of initial and final conv layers. upsample_scales: [12, 10, 2, 2] # Upsampling scales. - upsample_kernal_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers. + upsample_kernel_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers. resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. resblock_dilations: # Dilations for residual blocks. - [1, 3, 5] @@ -69,7 +69,7 @@ discriminator_params: scale_discriminator_params: in_channels: 1 # Number of input channels. out_channels: 1 # Number of output channels. - kernel_sizes: [15, 41, 5, 3] # List of kernal sizes. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. @@ -83,7 +83,7 @@ discriminator_params: period_discriminator_params: in_channels: 1 # Number of input channels. out_channels: 1 # Number of output channels. - kernel_sizes: [5, 3] # List of kernal sizes. + kernel_sizes: [5, 3] # List of kernel sizes. channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. diff --git a/egs/cvss_c/hubert_voc1/run.sh b/egs/cvss_c/hubert_voc1/run.sh index 0b7921f4..ec9756bf 100755 --- a/egs/cvss_c/hubert_voc1/run.sh +++ b/egs/cvss_c/hubert_voc1/run.sh @@ -14,7 +14,7 @@ n_gpus=1 # number of gpus in training n_jobs=16 # number of parallel jobs in feature extraction # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh -conf=conf/hifigan_hubert.v1.yaml +conf=conf/hifigan_hubert_duration.v1.yaml # directory path setting db_root=/usr0/home/jiatongs/data/cvss/es_en-c # direcotry including wavfiles (MODIFY BY YOURSELF) diff --git a/egs/vctk/hubert_voc1/conf/hifigan_hubert.v1.yaml b/egs/vctk/hubert_voc1/conf/hifigan_hubert.v1.yaml index 32229a5b..e354e8e8 100644 --- a/egs/vctk/hubert_voc1/conf/hifigan_hubert.v1.yaml +++ b/egs/vctk/hubert_voc1/conf/hifigan_hubert.v1.yaml @@ -34,7 +34,7 @@ generator_params: concat_spk_emb: false kernel_size: 7 # Kernel size of initial and final conv layers. upsample_scales: [10, 8, 2, 2] # Upsampling scales. - upsample_kernal_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers. + upsample_kernel_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers. resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. resblock_dilations: # Dilations for residual blocks. - [1, 3, 5] @@ -61,7 +61,7 @@ discriminator_params: scale_discriminator_params: in_channels: 1 # Number of input channels. out_channels: 1 # Number of output channels. - kernel_sizes: [15, 41, 5, 3] # List of kernal sizes. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. @@ -75,7 +75,7 @@ discriminator_params: period_discriminator_params: in_channels: 1 # Number of input channels. out_channels: 1 # Number of output channels. - kernel_sizes: [5, 3] # List of kernal sizes. + kernel_sizes: [5, 3] # List of kernel sizes. channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. diff --git a/egs/vctk/hubert_voc1/conf/hifigan_hubert_24k.v1.yaml b/egs/vctk/hubert_voc1/conf/hifigan_hubert_24k.v1.yaml index d89c609a..d8e55881 100644 --- a/egs/vctk/hubert_voc1/conf/hifigan_hubert_24k.v1.yaml +++ b/egs/vctk/hubert_voc1/conf/hifigan_hubert_24k.v1.yaml @@ -34,7 +34,7 @@ generator_params: concat_spk_emb: false kernel_size: 7 # Kernel size of initial and final conv layers. upsample_scales: [12, 10, 2, 2] # Upsampling scales. - upsample_kernal_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers. + upsample_kernel_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers. resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. resblock_dilations: # Dilations for residual blocks. - [1, 3, 5] @@ -61,7 +61,7 @@ discriminator_params: scale_discriminator_params: in_channels: 1 # Number of input channels. out_channels: 1 # Number of output channels. - kernel_sizes: [15, 41, 5, 3] # List of kernal sizes. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. channels: 128 # Initial number of channels. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. max_groups: 16 # Maximum number of groups in downsampling conv layers. @@ -75,7 +75,7 @@ discriminator_params: period_discriminator_params: in_channels: 1 # Number of input channels. out_channels: 1 # Number of output channels. - kernel_sizes: [5, 3] # List of kernal sizes. + kernel_sizes: [5, 3] # List of kernel sizes. channels: 32 # Initial number of channels. downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. diff --git a/parallel_wavegan/models/hifigan.py b/parallel_wavegan/models/hifigan.py index 8323fcdb..e42eea98 100644 --- a/parallel_wavegan/models/hifigan.py +++ b/parallel_wavegan/models/hifigan.py @@ -792,7 +792,7 @@ def __init__( concat_spk_emb=False, kernel_size=7, upsample_scales=(8, 8, 2, 2), - upsample_kernal_sizes=(16, 16, 4, 4), + upsample_kernel_sizes=(16, 16, 4, 4), resblock_kernel_sizes=(3, 7, 11), resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)], use_additional_convs=True, @@ -813,8 +813,8 @@ def __init__( concat_spk_emb (bool): whether to concat speaker embedding to the input kernel_size (int): Kernel size of initial and final conv layer. upsample_scales (list): List of upsampling scales. - upsample_kernal_sizes (list): List of kernal sizes for upsampling layers. - resblock_kernal_sizes (list): List of kernal sizes for residual blocks. + upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. + resblock_kernel_sizes (list): List of kernel sizes for residual blocks. resblock_dilations (list): List of dilation list for residual blocks. use_additional_convs (bool): Whether to use additional conv layers in residual blocks. bias (bool): Whether to add bias parameter in convolution layers. @@ -843,11 +843,11 @@ def __init__( # check hyperparameters are valid assert kernel_size % 2 == 1, "Kernal size must be odd number." - assert len(upsample_scales) == len(upsample_kernal_sizes) + assert len(upsample_scales) == len(upsample_kernel_sizes) assert len(resblock_dilations) == len(resblock_kernel_sizes) # define modules - self.num_upsamples = len(upsample_kernal_sizes) + self.num_upsamples = len(upsample_kernel_sizes) self.num_blocks = len(resblock_kernel_sizes) self.input_conv = torch.nn.Conv1d( in_channels, @@ -858,7 +858,7 @@ def __init__( ) self.upsamples = torch.nn.ModuleList() self.blocks = torch.nn.ModuleList() - for i in range(len(upsample_kernal_sizes)): + for i in range(len(upsample_kernel_sizes)): self.upsamples += [ torch.nn.Sequential( getattr(torch.nn, nonlinear_activation)( @@ -867,9 +867,9 @@ def __init__( torch.nn.ConvTranspose1d( channels // (2**i), channels // (2 ** (i + 1)), - upsample_kernal_sizes[i], + upsample_kernel_sizes[i], upsample_scales[i], - padding=(upsample_kernal_sizes[i] - upsample_scales[i]) // 2, + padding=(upsample_kernel_sizes[i] - upsample_scales[i]) // 2, ), ) ] @@ -1024,7 +1024,7 @@ def __init__( duration_dropout_rate=0.5, kernel_size=7, upsample_scales=(8, 8, 2, 2), - upsample_kernal_sizes=(16, 16, 4, 4), + upsample_kernel_sizes=(16, 16, 4, 4), resblock_kernel_sizes=(3, 7, 11), resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)], use_additional_convs=True, @@ -1050,8 +1050,8 @@ def __init__( duration_dropout_rate (float): duration predictor dropout rate kernel_size (int): Kernel size of initial and final conv layer. upsample_scales (list): List of upsampling scales. - upsample_kernal_sizes (list): List of kernal sizes for upsampling layers. - resblock_kernal_sizes (list): List of kernal sizes for residual blocks. + upsample_kernel_sizes (list): List of kernel sizes for upsampling layers. + resblock_kernel_sizes (list): List of kernel sizes for residual blocks. resblock_dilations (list): List of dilation list for residual blocks. use_additional_convs (bool): Whether to use additional conv layers in residual blocks. bias (bool): Whether to add bias parameter in convolution layers. @@ -1071,7 +1071,7 @@ def __init__( concat_spk_emb=concat_spk_emb, kernel_size=kernel_size, upsample_scales=upsample_scales, - upsample_kernal_sizes=upsample_kernal_sizes, + upsample_kernel_sizes=upsample_kernel_sizes, resblock_kernel_sizes=resblock_kernel_sizes, resblock_dilations=resblock_dilations, use_additional_convs=use_additional_convs, @@ -1156,7 +1156,7 @@ def inference(self, c, g=None, ds=None, normalize_before=False): c = c[:, 0:1] if ds is None: - c = self.synthesis(c.transpose(1, 0).unsqueeze(0)) + c, _ = self.synthesis(c.transpose(1, 0).unsqueeze(0)) else: c, _ = self.forward(c.transpose(1, 0).unsqueeze(0), ds.unsqueeze(0)) return c.squeeze(0).transpose(1, 0)