-
-
Notifications
You must be signed in to change notification settings - Fork 344
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #386 from ftshijt/master
- Loading branch information
Showing
7 changed files
with
526 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ====== | ||
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...> | ||
# e.g. | ||
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB | ||
# | ||
# Options: | ||
# --time <time>: Limit the maximum time to execute. | ||
# --mem <mem>: Limit the maximum memory usage. | ||
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs. | ||
# --num-threads <nthreads>: Specify the number of CPU cores. | ||
# --gpu <ngpu>: Specify the number of GPU devices. | ||
# --config: Change the configuration file from default. | ||
# | ||
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs. | ||
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name, | ||
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively. | ||
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example. | ||
# | ||
# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend. | ||
# These options are mapping to specific options for each backend and | ||
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default. | ||
# If jobs failed, your configuration might be wrong for your environment. | ||
# | ||
# | ||
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl: | ||
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html | ||
# =========================================================~ | ||
|
||
|
||
# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh". | ||
# Each branch below exports the same three variables: | ||
#   train_cmd  - launcher for general-purpose jobs | ||
#   cuda_cmd   - launcher for "*_train.py" jobs | ||
#   decode_cmd - launcher for "*_recog.py" jobs | ||
cmd_backend="local" | ||
|
||
# "local": local machine, without any job scheduling system (utils/run.pl) | ||
if [ "${cmd_backend}" = local ]; then | ||
|
||
# Used for all other (general-purpose) jobs | ||
export train_cmd="utils/run.pl" | ||
# Used for "*_train.py": "--gpu" is appended optionally by run.sh | ||
export cuda_cmd="utils/run.pl" | ||
# Used for "*_recog.py" | ||
export decode_cmd="utils/run.pl" | ||
|
||
# "stdout": jobs are launched through utils/stdout.pl | ||
# NOTE(review): presumably also runs on the local machine without a scheduler, | ||
# differing from run.pl only in where the output goes -- confirm against utils/stdout.pl. | ||
elif [ "${cmd_backend}" = stdout ]; then | ||
|
||
# Used for all other (general-purpose) jobs | ||
export train_cmd="utils/stdout.pl" | ||
# Used for "*_train.py": "--gpu" is appended optionally by run.sh | ||
export cuda_cmd="utils/stdout.pl" | ||
# Used for "*_recog.py" | ||
export decode_cmd="utils/stdout.pl" | ||
|
||
# "sge": "qsub" (SGE, Torque, PBS, etc.) | ||
elif [ "${cmd_backend}" = sge ]; then | ||
# The default setting is written in conf/queue.conf. | ||
# You must change "-q g.q" to the "queue" name for your environment. | ||
# To list the "queue" names, type "qhost -q". | ||
# Note that to use "--gpu *", you have to set up "complex_value" for the system scheduler. | ||
|
||
export train_cmd="utils/queue.pl" | ||
export cuda_cmd="utils/queue.pl" | ||
export decode_cmd="utils/queue.pl" | ||
|
||
# "slurm": "sbatch" (Slurm) | ||
elif [ "${cmd_backend}" = slurm ]; then | ||
# The default setting is written in conf/slurm.conf. | ||
# You must change "-p cpu" and "-p gpu" to the "partition" names for your environment. | ||
# To list the "partition" names, type "sinfo". | ||
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*" | ||
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}". | ||
|
||
export train_cmd="utils/slurm.pl" | ||
export cuda_cmd="utils/slurm.pl" | ||
export decode_cmd="utils/slurm.pl" | ||
|
||
# "ssh": jobs are launched through utils/ssh.pl | ||
elif [ "${cmd_backend}" = ssh ]; then | ||
# You have to create ".queue/machines" to specify the hosts on which to execute jobs. | ||
# e.g. .queue/machines | ||
# host1 | ||
# host2 | ||
# host3 | ||
# This assumes you can log in to them without a password, i.e. you have to set up ssh keys. | ||
|
||
export train_cmd="utils/ssh.pl" | ||
export cuda_cmd="utils/ssh.pl" | ||
export decode_cmd="utils/ssh.pl" | ||
|
||
# Any other value is a configuration error: report it on stderr. | ||
# "return" (rather than "exit") is only valid when this file is sourced. | ||
else | ||
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2 | ||
return 1 | ||
fi |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,178 @@ | ||
# This is the configuration file for LJSpeech dataset. | ||
# This configuration is based on HiFiGAN V1, which is | ||
# an official configuration. But I found that the optimizer | ||
# setting does not work well with my implementation. | ||
# So I changed optimizer settings as follows: | ||
# - AdamW -> Adam | ||
# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] | ||
# - Scheduler: ExponentialLR -> MultiStepLR | ||
|
||
########################################################### | ||
# FEATURE EXTRACTION SETTING # | ||
########################################################### | ||
sampling_rate: 16000 # Sampling rate. | ||
fft_size: 512 # FFT size. | ||
hop_size: 160 # Hop size. | ||
win_length: 400 # Window length. | ||
# If set to null, it will be the same as fft_size. | ||
window: "hann" # Window function. | ||
num_mels: 80 # Number of mel basis. | ||
fmin: 80 # Minimum freq in mel basis calculation. | ||
fmax: 7600 # Maximum frequency in mel basis calculation. | ||
global_gain_scale: 1.0 # Will be multiplied to all of waveform. | ||
trim_silence: false # Whether to trim the start and end of silence. | ||
trim_threshold_in_db: 20 # Need to tune carefully if the recording is not good. | ||
trim_frame_size: 500 # Frame size in trimming. | ||
trim_hop_size: 160 # Hop size in trimming. | ||
format: "hdf5" # Feature file format. "npy" or "hdf5" is supported. | ||
|
||
########################################################### | ||
# GENERATOR NETWORK ARCHITECTURE SETTING # | ||
########################################################### | ||
generator_type: HiFiGANGenerator | ||
generator_params: | ||
in_channels: 80 # Number of input channels. | ||
out_channels: 1 # Number of output channels. | ||
channels: 512 # Number of initial channels. | ||
kernel_size: 7 # Kernel size of initial and final conv layers. | ||
upsample_scales: [8, 5, 2, 2] # Upsampling scales. | ||
upsample_kernel_sizes: [16, 10, 4, 4] # Kernel size for upsampling layers. | ||
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. | ||
resblock_dilations: # Dilations for residual blocks. | ||
- [1, 3, 5] | ||
- [1, 3, 5] | ||
- [1, 3, 5] | ||
use_additional_convs: true # Whether to use additional conv layer in residual blocks. | ||
bias: true # Whether to use bias parameter in conv. | ||
nonlinear_activation: "LeakyReLU" # Nonlinear activation type. | ||
nonlinear_activation_params: # Nonlinear activation parameters. | ||
negative_slope: 0.1 | ||
use_weight_norm: true # Whether to apply weight normalization. | ||
|
||
########################################################### | ||
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # | ||
########################################################### | ||
discriminator_type: HiFiGANMultiScaleMultiPeriodDiscriminator | ||
discriminator_params: | ||
scales: 3 # Number of multi-scale discriminator. | ||
scale_downsample_pooling: "AvgPool1d" # Pooling operation for scale discriminator. | ||
scale_downsample_pooling_params: | ||
kernel_size: 4 # Pooling kernel size. | ||
stride: 2 # Pooling stride. | ||
padding: 2 # Padding size. | ||
scale_discriminator_params: | ||
in_channels: 1 # Number of input channels. | ||
out_channels: 1 # Number of output channels. | ||
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. | ||
channels: 128 # Initial number of channels. | ||
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. | ||
max_groups: 16 # Maximum number of groups in downsampling conv layers. | ||
bias: true | ||
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. | ||
nonlinear_activation: "LeakyReLU" # Nonlinear activation. | ||
nonlinear_activation_params: | ||
negative_slope: 0.1 | ||
follow_official_norm: true # Whether to follow the official norm setting. | ||
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. | ||
period_discriminator_params: | ||
in_channels: 1 # Number of input channels. | ||
out_channels: 1 # Number of output channels. | ||
kernel_sizes: [5, 3] # List of kernel sizes. | ||
channels: 32 # Initial number of channels. | ||
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. | ||
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. | ||
bias: true # Whether to use bias parameter in conv layer. | ||
nonlinear_activation: "LeakyReLU" # Nonlinear activation. | ||
nonlinear_activation_params: # Nonlinear activation parameters. | ||
negative_slope: 0.1 | ||
use_weight_norm: true # Whether to apply weight normalization. | ||
use_spectral_norm: false # Whether to apply spectral normalization. | ||
|
||
########################################################### | ||
# STFT LOSS SETTING # | ||
########################################################### | ||
use_stft_loss: false # Whether to use multi-resolution STFT loss. | ||
use_mel_loss: true # Whether to use Mel-spectrogram loss. | ||
mel_loss_params: | ||
fs: 16000 | ||
fft_size: 512 | ||
hop_size: 160 | ||
win_length: 400 | ||
window: "hann" | ||
num_mels: 80 | ||
fmin: 0 | ||
fmax: 11025 # NOTE(review): above the Nyquist frequency (8000 Hz) for fs: 16000 -- confirm the intended value. | ||
log_base: null | ||
generator_adv_loss_params: | ||
average_by_discriminators: false # Whether to average loss by #discriminators. | ||
discriminator_adv_loss_params: | ||
average_by_discriminators: false # Whether to average loss by #discriminators. | ||
use_feat_match_loss: true | ||
feat_match_loss_params: | ||
average_by_discriminators: false # Whether to average loss by #discriminators. | ||
average_by_layers: false # Whether to average loss by #layers in each discriminator. | ||
include_final_outputs: false # Whether to include final outputs in feat match loss calculation. | ||
|
||
########################################################### | ||
# ADVERSARIAL LOSS SETTING # | ||
########################################################### | ||
lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. | ||
lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. | ||
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss. | ||
|
||
########################################################### | ||
# DATA LOADER SETTING # | ||
########################################################### | ||
batch_size: 16 # Batch size. | ||
batch_max_steps: 8192 # Length of each audio in batch. Make sure it is divisible by hop_size. | ||
pin_memory: true # Whether to pin memory in Pytorch DataLoader. | ||
num_workers: 2 # Number of workers in Pytorch DataLoader. | ||
remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps. | ||
allow_cache: true # Whether to allow cache in dataset. If true, it requires cpu memory. | ||
|
||
########################################################### | ||
# OPTIMIZER & SCHEDULER SETTING # | ||
########################################################### | ||
generator_optimizer_type: Adam | ||
generator_optimizer_params: | ||
lr: 2.0e-4 | ||
betas: [0.5, 0.9] | ||
weight_decay: 0.0 | ||
generator_scheduler_type: MultiStepLR | ||
generator_scheduler_params: | ||
gamma: 0.5 | ||
milestones: | ||
- 200000 | ||
- 400000 | ||
- 600000 | ||
- 800000 | ||
generator_grad_norm: -1 | ||
discriminator_optimizer_type: Adam | ||
discriminator_optimizer_params: | ||
lr: 2.0e-4 | ||
betas: [0.5, 0.9] | ||
weight_decay: 0.0 | ||
discriminator_scheduler_type: MultiStepLR | ||
discriminator_scheduler_params: | ||
gamma: 0.5 | ||
milestones: | ||
- 200000 | ||
- 400000 | ||
- 600000 | ||
- 800000 | ||
discriminator_grad_norm: -1 | ||
|
||
########################################################### | ||
# INTERVAL SETTING # | ||
########################################################### | ||
generator_train_start_steps: 1 # Number of steps to start to train generator. | ||
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. | ||
train_max_steps: 2500000 # Number of training steps. | ||
save_interval_steps: 10000 # Interval steps to save checkpoint. | ||
eval_interval_steps: 1000 # Interval steps to evaluate the network. | ||
log_interval_steps: 100 # Interval steps to record the training log. | ||
|
||
########################################################### | ||
# OTHER SETTING # | ||
########################################################### | ||
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Default configuration | ||
command sbatch --export=PATH --ntasks-per-node=1 | ||
option time=* --time $0 | ||
option mem=* --mem-per-cpu $0 | ||
option mem=0 # Do not add anything to qsub_opts | ||
option num_threads=* --cpus-per-task $0 --ntasks-per-node=1 | ||
option num_threads=1 --cpus-per-task 1 --ntasks-per-node=1 # Do not add anything to qsub_opts | ||
default gpu=0 | ||
option gpu=0 -p cpu | ||
option gpu=* -p gpu --gres=gpu:$0 | ||
# note: the --max-jobs-run option is supported as a special case | ||
# by slurm.pl and you don't have to handle it in the config file. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/bin/bash

# Copyright 2022 Jiatong Shi
# MIT License (https://opensource.org/licenses/MIT)

# Write a wav.scp file for each of the "train", "dev" and "test" subsets.
#
# Usage: data_prep.sh [Options] <db_root> <data_dir>
#   <db_root>   Directory searched recursively (following symlinks) for *.wav.
#   <data_dir>  Output directory; <data_dir>/<subset>/wav.scp is (re)written.
#
# Every wav.scp line has the form
#   <id> cat <path> | sox -t wav - -c 1 -b 16 -t wav - rate <fs> |
# where <id> is the file name without its last extension.
#
# Options are handled by utils/parse_options.sh
# (NOTE(review): presumably "--fs <rate>" overrides the default below -- confirm).
#
# NOTE(review): all three subsets receive the same, complete file list; this
# matches the previous behaviour -- confirm that no split is intended here.

# shellcheck disable=SC1091
. ./path.sh || exit 1;

fs=22050

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

# Validate the argument count before touching the positional parameters,
# and report misuse on stderr.
if [ $# != 2 ]; then
    echo "Usage: $0 [Options] <db_root> <data_dir>" >&2
    exit 1
fi

db_root=$1
data_dir=$2

set -euo pipefail

# Collect the file list once; it does not depend on the subset.
# With pipefail + errexit a failing find (e.g. missing db_root) aborts here.
wav_list=$(find "${db_root}" -follow -name "*.wav" | sort)
if [ -z "${wav_list}" ]; then
    # Do not claim success when there is nothing to write.
    echo "$0: Error: no *.wav files found under ${db_root}" >&2
    exit 1
fi

for subset in "train" "dev" "test"; do
    mkdir -p "${data_dir}/${subset}"
    scp="${data_dir}/${subset}/wav.scp"
    # A single truncating redirect replaces the old "remove, then append" logic.
    # IFS= keeps leading/trailing whitespace in paths intact.
    while IFS= read -r filename; do
        id=${filename##*/}   # strip the directory part
        id=${id%.*}          # strip the last extension
        printf '%s cat %s | sox -t wav - -c 1 -b 16 -t wav - rate %s |\n' \
            "${id}" "${filename}" "${fs}"
    done <<< "${wav_list}" > "${scp}"
done

echo "Successfully prepared data."
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Environment setup for the recipe scripts.
# This file is meant to be sourced: it exports variables into the caller's
# shell and uses `return` (not `exit`) when a required tool is missing.

# cuda related
export CUDA_HOME=/usr/local/cuda-10.0
# Append the previous value only when it is set and non-empty. A bare
# "${CUDA_HOME}/lib64:" would leave an empty trailing entry, which the dynamic
# loader treats as the current directory, and a plain ${LD_LIBRARY_PATH}
# expansion fails when the caller runs with `set -u`.
export LD_LIBRARY_PATH="${CUDA_HOME}/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# path related
export PRJ_ROOT="${PWD}/../../.."
# Activate the project's virtualenv when one exists under tools/venv.
if [ -e "${PRJ_ROOT}/tools/venv/bin/activate" ]; then
    # shellcheck disable=SC1090
    . "${PRJ_ROOT}/tools/venv/bin/activate"
fi

# python related
export OMP_NUM_THREADS=1
export PYTHONIOENCODING=UTF-8
export MPL_BACKEND=Agg

# check installation: each required command must be on PATH
if ! command -v parallel-wavegan-train > /dev/null; then
    echo "Error: It seems setup is not finished." >&2
    echo "Error: Please setup your environment by following README.md" >&2
    return 1
fi
if ! command -v jq > /dev/null; then
    echo "Error: It seems jq is not installed." >&2
    echo "Error: Please install via \`sudo apt-get install jq\`." >&2
    echo "Error: If you do not have sudo, please download from https://stedolan.github.io/jq/download/." >&2
    return 1
fi
if ! command -v yq > /dev/null; then
    echo "Error: It seems yq is not installed." >&2
    echo "Error: Please install via \`pip install yq\`." >&2
    return 1
fi
Oops, something went wrong.