Skip to content

Commit

Permalink
fix comment
Browse files Browse the repository at this point in the history
  • Loading branch information
lym0302 committed Mar 9, 2023
1 parent 3df69e7 commit 9acc852
Show file tree
Hide file tree
Showing 19 changed files with 302 additions and 132 deletions.
6 changes: 3 additions & 3 deletions examples/opencpop/svs1/conf/default.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ model:
# music score related
note_num: 300 # number of notes
is_slur_num: 2 # number of slur
stretch: True # whether to stretch before diffusion

# fastspeech2 module
fastspeech2_params:
Expand Down Expand Up @@ -142,15 +143,14 @@ ds_grad_norm: 1
###########################################################
# INTERVAL SETTING #
###########################################################
only_train_diffusion: True # Whether to freeze fastspeech2 parameters when training diffusion
ds_train_start_steps: 160000 # Number of steps to start to train diffusion module.
train_max_steps: 320000 # Number of training steps.
save_interval_steps: 2000 # Interval steps to save checkpoint.
eval_interval_steps: 2000 # Interval steps to evaluate the network.
num_snapshots: 5 # Number of saved models

num_snapshots: 5

###########################################################
# OTHER SETTING #
###########################################################
seed: 10086
find_unused_parameters: True
8 changes: 8 additions & 0 deletions examples/opencpop/svs1/local/preprocess.sh
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,11 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
fi

if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# Get feature(mel) extremum for diffusion stretch
echo "Get feature(mel) extremum ..."
python3 ${BIN_DIR}/computer_extremum.py \
--metadata=dump/train/norm/metadata.jsonl \
--speech-stretchs=dump/train/speech_stretchs.npy
fi
6 changes: 2 additions & 4 deletions examples/opencpop/svs1/local/synthesize.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
config_path=$1
train_output_path=$2
ckpt_name=$3
#iter=$3
#ckpt_name=snapshot_iter_${iter}.pdz
stage=0
stop_stage=0

Expand All @@ -21,8 +19,8 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_config=pwgan_opencpop/default.yaml \
--voc_ckpt=pwgan_opencpop/snapshot_iter_100000.pdz \
--voc_stat=pwgan_opencpop/feats_stats.npy \
--test_metadata=test.jsonl \
--output_dir=${train_output_path}/test_${iter} \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test \
--phones_dict=dump/phone_id_map.txt
fi

3 changes: 2 additions & 1 deletion examples/opencpop/svs1/local/train.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@ python3 ${BIN_DIR}/train.py \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt
--phones-dict=dump/phone_id_map.txt \
--speech-stretchs=dump/train/speech_stretchs.npy
13 changes: 6 additions & 7 deletions paddlespeech/t2s/datasets/get_feats.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,18 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
from typing import List
from typing import Optional
from typing import Union

import librosa
import numpy as np
import pyworld
from scipy.interpolate import interp1d
from typing import List

from typing import Optional
from typing import Union
from typing_extensions import Literal



class LogMelFBank():
def __init__(self,
sr: int=24000,
Expand Down Expand Up @@ -80,7 +79,7 @@ def _stft(self, wav: np.ndarray):

def _spectrogram(self, wav: np.ndarray):
D = self._stft(wav)
return np.abs(D) ** self.power
return np.abs(D)**self.power

def _mel_spectrogram(self, wav: np.ndarray):
S = self._spectrogram(wav)
Expand Down Expand Up @@ -139,7 +138,7 @@ def _calculate_f0(self,
input: np.ndarray,
use_continuous_f0: bool=True,
use_log_f0: bool=True) -> np.ndarray:
input = input.astype(float)
input = input.astype(np.float32)
frame_period = 1000 * self.hop_length / self.sr
f0, timeaxis = pyworld.dio(
input,
Expand Down
2 changes: 1 addition & 1 deletion paddlespeech/t2s/exps/diffsinger/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
83 changes: 83 additions & 0 deletions paddlespeech/t2s/exps/diffsinger/computer_extremum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging

import jsonlines
import numpy as np
from tqdm import tqdm

from paddlespeech.t2s.datasets.data_table import DataTable


def find_min_max_spec(spec, min_spec, max_spec):
    """Fold one spectrogram into the running per-bin minimum and maximum.

    Args:
        spec: 2-D array indexed as ``[frame, bin]`` (the original comment
            gives the shape as ``[T, 80]``).
        min_spec: Running per-bin minima, at least ``spec.shape[1]`` entries
            long. Updated in place.
        max_spec: Running per-bin maxima, at least ``spec.shape[1]`` entries
            long. Updated in place.

    Returns:
        The ``(min_spec, max_spec)`` pair that was passed in, after the
        update.
    """
    # A spectrogram with no frames carries no information, and np.min/np.max
    # raise on an empty reduction axis, so leave the running values untouched.
    if spec.shape[0] == 0:
        return min_spec, max_spec

    # Only the first `n_bins` entries are touched, mirroring the per-column
    # loop this replaces; slice assignment keeps the update in place.
    n_bins = spec.shape[1]
    min_spec[:n_bins] = np.minimum(min_spec[:n_bins], spec.min(axis=0))
    max_spec[:n_bins] = np.maximum(max_spec[:n_bins], spec.max(axis=0))

    return min_spec, max_spec


def main():
    """Compute the per-bin minimum and maximum of dumped mel features.

    Reads the jsonlines metadata file given by ``--metadata``, loads each
    item's ``speech`` entry with ``np.load``, accumulates the per-bin minimum
    and maximum over all items, and saves them to ``--speech-stretchs`` as a
    float32 array whose row 0 holds the minima and row 1 the maxima.

    Raises:
        ValueError: If the dataset yields no items, since there is then no
            extremum to save.
    """
    parser = argparse.ArgumentParser(
        description="Compute the per-bin minimum and maximum of dumped "
        "mel features.")
    parser.add_argument(
        "--metadata",
        type=str,
        required=True,
        help="jsonlines metadata file listing the feature files to scan.")

    parser.add_argument(
        "--speech-stretchs",
        type=str,
        required=True,
        help="output file for the min/max spec. only compute on train data")

    args = parser.parse_args()

    # get dataset
    with jsonlines.open(args.metadata, 'r') as reader:
        metadata = list(reader)
    dataset = DataTable(
        metadata, converters={
            "speech": np.load,
        })
    logging.info(f"The number of files = {len(dataset)}.")

    # The running extrema are created lazily from the first item so the bin
    # count follows the data instead of a hard-coded 80, and they start at
    # +/-inf so that no real value can be clamped by the initial sentinel.
    min_spec = None
    max_spec = None

    for item in tqdm(dataset):
        spec = item['speech']
        if min_spec is None:
            n_mel = spec.shape[1]
            min_spec = np.full(n_mel, np.inf, dtype=np.float32)
            max_spec = np.full(n_mel, -np.inf, dtype=np.float32)
        min_spec, max_spec = find_min_max_spec(spec, min_spec, max_spec)

    if min_spec is None:
        raise ValueError(
            f"No features found in {args.metadata}; cannot compute extremum.")

    print(min_spec)
    print(max_spec)

    min_max_spec = np.stack([min_spec, max_spec], axis=0)
    np.save(
        str(args.speech_stretchs),
        min_max_spec.astype(np.float32),
        allow_pickle=False)


# Run the extremum computation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
2 changes: 1 addition & 1 deletion paddlespeech/t2s/exps/diffsinger/normalize.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
2 changes: 1 addition & 1 deletion paddlespeech/t2s/exps/diffsinger/preprocess.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
30 changes: 20 additions & 10 deletions paddlespeech/t2s/exps/diffsinger/train.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -127,9 +127,21 @@ def train_sp(args, config):
vocab_size = len(phn_id)
print("vocab_size:", vocab_size)

with open(args.speech_stretchs, "r") as f:
spec_min = np.load(args.speech_stretchs)[0]
spec_max = np.load(args.speech_stretchs)[1]
spec_min = paddle.to_tensor(spec_min)
spec_max = paddle.to_tensor(spec_max)
print("min and max spec done!")

odim = config.n_mels
config["model"]["fastspeech2_params"]["spk_num"] = spk_num
model = DiffSinger(idim=vocab_size, odim=odim, **config["model"])
model = DiffSinger(
idim=vocab_size,
odim=odim,
**config["model"],
spec_min=spec_min,
spec_max=spec_max)
model_fs2 = model.fs2
model_ds = model.diffusion
if world_size > 1:
Expand All @@ -143,13 +155,6 @@ def train_sp(args, config):
print("criterions done!")

optimizer_fs2 = build_optimizers(model_fs2, **config["fs2_optimizer"])
# gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
# optimizer_ds = AdamW(
# learning_rate=config["ds_scheduler_params"]["learning_rate"],
# grad_clip=gradient_clip_ds,
# parameters=model_ds.parameters(),
# **config["ds_optimizer_params"])

lr_schedule_ds = StepDecay(**config["ds_scheduler_params"])
gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
optimizer_ds = AdamW(
Expand Down Expand Up @@ -178,7 +183,8 @@ def train_sp(args, config):
},
dataloader=train_dataloader,
ds_train_start_steps=config.ds_train_start_steps,
output_dir=output_dir)
output_dir=output_dir,
only_train_diffusion=config["only_train_diffusion"])

evaluator = DiffSingerEvaluator(
model=model,
Expand Down Expand Up @@ -222,6 +228,10 @@ def main():
type=str,
default=None,
help="speaker id map file for multiple speaker model.")
parser.add_argument(
"--speech-stretchs",
type=str,
help="The min and max values of the mel spectrum.")

args = parser.parse_args()

Expand Down
2 changes: 0 additions & 2 deletions paddlespeech/t2s/exps/syn_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,6 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
print("single speaker fastspeech2!")
elif am_name == 'diffsinger':
fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
elif am_name == 'fastspeech2midi':
fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
elif am_name == 'speedyspeech':
fields = ["utt_id", "phones", "tones"]
elif am_name == 'tacotron2':
Expand Down
35 changes: 10 additions & 25 deletions paddlespeech/t2s/exps/synthesize.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,44 +112,29 @@ def evaluate(args):
note = paddle.to_tensor(datum["note"])
note_dur = paddle.to_tensor(datum["note_dur"])
is_slur = paddle.to_tensor(datum["is_slur"])
# get_mel_fs2=False returns the mel from diffusion; get_mel_fs2=True returns the mel from fastspeech2.
get_mel_fs2 = False
# mel: [T, mel_bin]
mel1 = am_inference(
phone_ids,
note=note,
note_dur=note_dur,
is_slur=is_slur,
get_mel_fs2=True)
mel2 = am_inference(
mel = am_inference(
phone_ids,
note=note,
note_dur=note_dur,
is_slur=is_slur,
get_mel_fs2=False)
wav1 = voc_inference(mel1)
wav2 = voc_inference(mel2)
get_mel_fs2=get_mel_fs2)
# vocoder
wav = voc_inference(mel)

wav1 = wav1.numpy()
wav2 = wav2.numpy()
N += wav1.size
N += wav2.size
wav = wav.numpy()
N += wav.size
T += t.elapse
speed = 2 * wav1.size / t.elapse
speed = wav.size / t.elapse
rtf = am_config.fs / speed
print(
f"{utt_id}, mel: {mel1.shape}, wave: {wav1.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
f"{utt_id}, mel: {mel.shape}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
sf.write(
str(output_dir / (utt_id + "_fs2.wav")),
wav1,
samplerate=am_config.fs)
sf.write(
str(output_dir / (utt_id + "_diffusion.wav")),
wav2,
samplerate=am_config.fs)

str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!")
# break
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")


Expand Down
2 changes: 1 addition & 1 deletion paddlespeech/t2s/models/diffsinger/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down
Loading

0 comments on commit 9acc852

Please sign in to comment.