refactoring tests after Coqpit

erogol committed May 11, 2021
1 parent 87384c6 commit 5aee304
Showing 29 changed files with 156 additions and 153 deletions.
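For context: Coqpit replaces the old JSON-driven config dicts with typed dataclasses that still behave like mappings. A minimal sketch of the pattern (`ExampleConfig` is hypothetical; only `coqpit.Coqpit` is the real library class):

```python
from dataclasses import dataclass

from coqpit import Coqpit


@dataclass
class ExampleConfig(Coqpit):
    """Hypothetical config; the real ones live in TTS.config and TTS.tts.configs."""

    num_mels: int = 80
    sample_rate: int = 22050


c = ExampleConfig(sample_rate=16000)
print(c.num_mels)        # attribute access, with defaults and type hints
print(c["sample_rate"])  # dict-style access still works, which is why **c unpacking appears below
```

This is why the tests in this commit can drop `load_config` calls and build configs directly in code.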
8 changes: 1 addition & 7 deletions run_bash_tests.sh
@@ -2,13 +2,7 @@ set -e
TF_CPP_MIN_LOG_LEVEL=3

# runtime bash based tests
# TODO: move these to python
./tests/bash_tests/test_demo_server.sh && \
./tests/bash_tests/test_resample.sh && \
./tests/bash_tests/test_tacotron_train.sh && \
./tests/bash_tests/test_glow-tts_train.sh && \
./tests/bash_tests/test_vocoder_gan_train.sh && \
./tests/bash_tests/test_vocoder_wavernn_train.sh && \
./tests/bash_tests/test_vocoder_wavegrad_train.sh && \
./tests/bash_tests/test_speedy_speech_train.sh && \
./tests/bash_tests/test_aligntts_train.sh && \
./tests/bash_tests/test_compute_statistics.sh
52 changes: 26 additions & 26 deletions tests/inputs/test_config.json
@@ -1,24 +1,24 @@
{
"audio":{
"audio_processor": "audio", // to use dictate different audio processors, if available.
"num_mels": 80, // size of the mel spec frame.
"fft_size": 1024, // number of stft frequency levels. Size of the linear spectogram frame.
"sample_rate": 22050, // wav sample-rate. If different than the original data, it is resampled.
"frame_length_ms": null, // stft window length in ms.
"frame_shift_ms": null, // stft window hop-lengh in ms.
"audio_processor": "audio",
"num_mels": 80,
"fft_size": 1024,
"sample_rate": 22050,
"frame_length_ms": null,
"frame_shift_ms": null,
"hop_length": 256,
"win_length": 1024,
"preemphasis": 0.97, // pre-emphasis to reduce spec noise and make it more structured. If 0.0, no -pre-emphasis.
"min_level_db": -100, // normalization range
"ref_level_db": 20, // reference level db, theoretically 20db is the sound of air.
"power": 1.5, // value to sharpen wav signals after GL algorithm.
"griffin_lim_iters": 30,// #griffin-lim iterations. 30-60 is a good range. Larger the value, slower the generation.
"signal_norm": true, // normalize the spec values in range [0, 1]
"symmetric_norm": true, // move normalization to range [-1, 1]
"clip_norm": true, // clip normalized values into the range.
"max_norm": 4, // scale normalization to range [-max_norm, max_norm] or [0, max_norm]
"mel_fmin": 0, // minimum freq level for mel-spec. ~50 for male and ~95 for female voices. Tune for dataset!!
"mel_fmax": 8000, // maximum freq level for mel-spec. Tune for dataset!!
"preemphasis": 0.97,
"min_level_db": -100,
"ref_level_db": 20,
"power": 1.5,
"griffin_lim_iters": 30,
"signal_norm": true,
"symmetric_norm": true,
"clip_norm": true,
"max_norm": 4,
"mel_fmin": 0,
"mel_fmax": 8000,
"do_trim_silence": false,
"spec_gain": 20
},
@@ -53,15 +53,15 @@
"max_seq_len": 300,
"log_dir": "tests/outputs/",

// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": true, // use global style tokens
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
// -> dictionary using the style tokens {'token1': 'value', 'token2': 'value'} example {"0": 0.15, "1": 0.15, "5": -0.15}
// with the dictionary being len(dict) <= len(gst_num_style_tokens).
"gst_use_speaker_embedding": true, // if true pass speaker embedding in attention input GST.

"use_speaker_embedding": false,
"use_gst": true,
"gst": {
"gst_style_input": null,
"gst_use_speaker_embedding": true,
"gst_embedding_dim": 512,
"gst_num_heads": 4,
"gst_num_style_tokens": 10
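The inline `//` comments disappear because the Coqpit config classes now document these fields themselves. A minimal sketch of expressing the same audio block programmatically (field names taken from the JSON above; the `BaseAudioConfig` import is the one used later in this diff):

```python
from TTS.config import BaseAudioConfig

audio = BaseAudioConfig(
    num_mels=80,        # size of the mel spec frame
    fft_size=1024,      # number of stft frequency levels
    sample_rate=22050,
    hop_length=256,
    win_length=1024,
    mel_fmin=0,
    mel_fmax=8000,      # tune for the dataset
)
print(audio.sample_rate)  # 22050
```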
4 changes: 3 additions & 1 deletion tests/inputs/test_speaker_encoder_config.json
@@ -1,5 +1,6 @@

{
"model": "speaker_encoder",
"run_name": "test_speaker_encoder",
"run_description": "test speaker encoder.",
"audio":{
@@ -42,8 +43,9 @@
"checkpoint": true, // If true, it saves checkpoints per "save_step"
"save_step": 1000, // Number of training steps expected to save traning stats and checkpoints.
"print_step": 20, // Number of steps to log traning on console.
"batch_size": 32,
"output_path": "", // DATASET-RELATED: output path for all training outputs.
"model": {
"model_params": {
"input_dim": 40,
"proj_dim": 256,
"lstm_dim": 768,
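Renaming `model` to `model_params` lets the block feed the encoder constructor directly, which is exactly what `tests/test_speaker_manager.py` below does with `SpeakerEncoder(**config.model_params)`. A sketch under that assumption:

```python
from TTS.utils.io import load_config

config = load_config("tests/inputs/test_speaker_encoder_config.json")
model_kwargs = config.model_params  # {"input_dim": 40, "proj_dim": 256, "lstm_dim": 768, ...}
# model = SpeakerEncoder(**model_kwargs)  # as in test_speaker_manager.py below
```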
1 change: 0 additions & 1 deletion tests/outputs/dummy_model_config.json
@@ -87,7 +87,6 @@

// MULTI-SPEAKER and GST
"use_speaker_embedding": false, // use speaker embedding to enable multi-speaker learning.
"use_gst": true, // use global style tokens
"gst": { // gst parameter if gst is enabled
"gst_style_input": null, // Condition the style input either on a
// -> wave file [path to wave] or
16 changes: 8 additions & 8 deletions tests/test_audio.py
@@ -3,21 +3,21 @@

from tests import get_tests_input_path, get_tests_output_path, get_tests_path
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.config import BaseAudioConfig

TESTS_PATH = get_tests_path()
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

os.makedirs(OUT_PATH, exist_ok=True)
conf = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
conf = BaseAudioConfig(mel_fmax=8000)


# pylint: disable=protected-access
class TestAudio(unittest.TestCase):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.ap = AudioProcessor(**conf.audio)
self.ap = AudioProcessor(**conf)

def test_audio_synthesis(self):
"""1. load wav
@@ -163,12 +163,12 @@ def test_normalize(self):

def test_scaler(self):
scaler_stats_path = os.path.join(get_tests_input_path(), "scale_stats.npy")
conf.audio["stats_path"] = scaler_stats_path
conf.audio["preemphasis"] = 0.0
conf.audio["do_trim_silence"] = True
conf.audio["signal_norm"] = True
conf.stats_path = scaler_stats_path
conf.preemphasis = 0.0
conf.do_trim_silence = True
conf.signal_norm = True

ap = AudioProcessor(**conf.audio)
ap = AudioProcessor(**conf)
mel_mean, mel_std, linear_mean, linear_std, _ = ap.load_stats(scaler_stats_path)
ap.setup_scaler(mel_mean, mel_std, linear_mean, linear_std)

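The audio test no longer parses `test_config.json`; it builds its config in code. The pattern works because a Coqpit object supports both attribute assignment and mapping-style unpacking, which is what the test relies on:

```python
from TTS.config import BaseAudioConfig
from TTS.utils.audio import AudioProcessor

conf = BaseAudioConfig(mel_fmax=8000)
conf.signal_norm = True      # plain attributes replace the old conf.audio["..."] indirection
ap = AudioProcessor(**conf)  # Coqpit configs unpack like dicts
```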
2 changes: 1 addition & 1 deletion tests/test_glow_tts.py
@@ -17,7 +17,7 @@
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
c = GlowTTSConfig()

ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
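`GlowTTSConfig()` with no arguments now yields a fully populated config, including the nested `audio` section consumed by `AudioProcessor(**c.audio)` above. A sketch (the import path is an assumption, matching the `TTS.tts.configs` package used elsewhere in this commit):

```python
from TTS.tts.configs import GlowTTSConfig  # assumed import path
from TTS.utils.audio import AudioProcessor

c = GlowTTSConfig()             # sensible defaults for every field
ap = AudioProcessor(**c.audio)  # the nested audio config is a Coqpit too
```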
10 changes: 7 additions & 3 deletions tests/test_loader.py
@@ -10,13 +10,17 @@
from TTS.tts.datasets import TTSDataset
from TTS.tts.datasets.preprocess import ljspeech
from TTS.utils.audio import AudioProcessor
from TTS.utils.io import load_config
from TTS.tts.configs import BaseTTSConfig

# pylint: disable=unused-variable

OUTPATH = os.path.join(get_tests_output_path(), "loader_tests/")
os.makedirs(OUTPATH, exist_ok=True)
c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))

# create a dummy config for testing data loaders.
c = BaseTTSConfig(text_cleaner='english_cleaners', num_loader_workers=0, batch_size=2)
c.r = 5
c.data_path = "tests/data/ljspeech/"
ok_ljspeech = os.path.exists(c.data_path)

DATA_EXIST = True
@@ -40,7 +44,7 @@ def _create_dataloader(self, batch_size, r, bgs):
compute_linear_spec=True,
ap=self.ap,
meta_data=items,
tp=c.characters if "characters" in c.keys() else None,
tp=c.characters,
batch_group_size=bgs,
min_seq_len=c.min_seq_len,
max_seq_len=float("inf"),
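Because every `BaseTTSConfig` field exists up front with a default, the old `"characters" in c.keys()` guard collapses to a plain attribute read. A condensed sketch of the dummy loader config built above:

```python
from TTS.tts.configs import BaseTTSConfig

c = BaseTTSConfig(text_cleaner="english_cleaners", num_loader_workers=0, batch_size=2)
c.r = 5            # overrides can also be plain assignments
tp = c.characters  # defined (possibly None) even when never set explicitly
```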
11 changes: 6 additions & 5 deletions tests/test_speaker_encoder_train.py
@@ -2,7 +2,8 @@
import os
import shutil

from tests import get_tests_output_path, run_cli
from tests import get_tests_output_path, run_cli, get_device_id

from TTS.speaker_encoder.speaker_encoder_config import SpeakerEncoderConfig
from TTS.config.shared_configs import BaseAudioConfig

@@ -15,9 +16,9 @@
num_speakers_in_batch=1,
num_utters_per_speaker=10,
num_loader_workers=0,
max_train_step=10,
max_train_step=2,
print_step=1,
save_step=10,
save_step=1,
print_eval=True,
audio=BaseAudioConfig(num_mels=40)
)
@@ -27,7 +28,7 @@

# train the model for one epoch
command_train = (
f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_encoder.py --config_path {config_path} "
f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --config_path {config_path} "
f"--coqpit.output_path {output_path} "
"--coqpit.datasets.0.name ljspeech "
"--coqpit.datasets.0.meta_file_train metadata.csv "
@@ -41,6 +42,6 @@
continue_path = max(glob.glob(os.path.join(output_path, "*/")), key=os.path.getmtime)

# restore the model and continue training for one more epoch
command_train = f"CUDA_VISIBLE_DEVICES='' python TTS/bin/train_encoder.py --continue_path {continue_path} "
command_train = f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py --continue_path {continue_path} "
run_cli(command_train)
shutil.rmtree(continue_path)
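The training CLI rides on the same Coqpit machinery: any nested config field can be overridden with a `--coqpit.<dotted.path>` flag, and list entries are addressed by index (`datasets.0` above). A condensed sketch of the invocation, using the test helpers imported above (the config filename here is hypothetical):

```python
import os

from tests import get_device_id, get_tests_output_path, run_cli

config_path = os.path.join(get_tests_output_path(), "test_model_config.json")  # hypothetical name
run_cli(
    f"CUDA_VISIBLE_DEVICES='{get_device_id()}' python TTS/bin/train_encoder.py "
    f"--config_path {config_path} "
    "--coqpit.datasets.0.name ljspeech "  # dotted path into the first dataset entry
    "--coqpit.datasets.0.meta_file_train metadata.csv"
)
```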
6 changes: 3 additions & 3 deletions tests/test_speaker_manager.py
@@ -26,11 +26,11 @@ class SpeakerManagerTest(unittest.TestCase):
def test_speaker_embedding():
# load config
config = load_config(encoder_config_path)
config["audio"]["resample"] = True
config.audio.resample = True

# create a dummy speaker encoder
model = SpeakerEncoder(**config.model)
save_checkpoint(model, None, None, get_tests_input_path(), 0, 0)
model = SpeakerEncoder(**config.model_params)
save_checkpoint(model, None, None, get_tests_input_path(), 0)

# load audio processor and speaker encoder
ap = AudioProcessor(**config.audio)
3 changes: 2 additions & 1 deletion tests/test_synthesize.py
@@ -1,6 +1,7 @@
import os

from tests import get_tests_output_path, run_cli
from tests import get_tests_output_path, run_cli, get_device_id



def test_synthesize():
11 changes: 5 additions & 6 deletions tests/test_synthesizer.py
@@ -15,8 +15,8 @@ def _create_random_model(self):
# pylint: disable=global-statement
global symbols, phonemes
config = load_config(os.path.join(get_tests_output_path(), "dummy_model_config.json"))
if "characters" in config.keys():
symbols, phonemes = make_symbols(**config.characters)
if config.has('characters') and config.characters:
symbols, phonemes = make_symbols(**config.characters.to_dict())

num_chars = len(phonemes) if config.use_phonemes else len(symbols)
model = setup_model(num_chars, 0, config)
@@ -25,11 +25,10 @@ def test_in_out(self):

def test_in_out(self):
self._create_random_model()
config = load_config(os.path.join(get_tests_input_path(), "server_config.json"))
tts_root_path = get_tests_output_path()
config["tts_checkpoint"] = os.path.join(tts_root_path, config["tts_checkpoint"])
config["tts_config"] = os.path.join(tts_root_path, config["tts_config"])
synthesizer = Synthesizer(config["tts_checkpoint"], config["tts_config"], None, None)
tts_checkpoint = os.path.join(tts_root_path, 'checkpoint_10.pth.tar')
tts_config = os.path.join(tts_root_path, 'dummy_model_config.json')
synthesizer = Synthesizer(tts_checkpoint, tts_config, None, None)
synthesizer.tts("Better this test works!!")

def test_split_into_sentences(self):
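`has()` and `to_dict()` are Coqpit helpers: `has()` replaces the old `"characters" in config.keys()` membership test, and `to_dict()` turns a sub-config back into a plain mapping for keyword-argument APIs like `make_symbols`. A sketch with a loaded config:

```python
from TTS.utils.io import load_config

config = load_config("tests/outputs/dummy_model_config.json")
if config.has("characters") and config.characters:
    char_kwargs = config.characters.to_dict()  # plain dict, ready for **-style calls
    # symbols, phonemes = make_symbols(**char_kwargs)
```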
21 changes: 7 additions & 14 deletions tests/test_tacotron2_model.py
@@ -17,7 +17,7 @@
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
c = Tacotron2Config()

ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -152,10 +152,8 @@ def test_train_step(self):
num_chars=24,
r=c.r,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
use_gst=True,
gst=c.gst
).to(device)
model.train()
model_ref = copy.deepcopy(model)
@@ -216,10 +214,8 @@ def test_train_step(self):
num_chars=24,
r=c.r,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
use_gst=True,
gst=c.gst
).to(device)
model.train()
model_ref = copy.deepcopy(model)
@@ -280,11 +276,8 @@ def test_train_step():
r=c.r,
num_speakers=5,
speaker_embedding_dim=55,
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"],
use_gst=True,
gst=c.gst
).to(device)
model.train()
model_ref = copy.deepcopy(model)
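Instead of spelling out `gst_embedding_dim`, `gst_num_heads`, and the other `gst_*` keywords, the model now takes the whole GST sub-config through a single `gst=c.gst` argument next to a `use_gst` switch. A sketch (both import paths are assumptions; the constructor arguments match the diff):

```python
from TTS.tts.configs import Tacotron2Config     # assumed import path
from TTS.tts.models.tacotron2 import Tacotron2  # assumed import path

c = Tacotron2Config()
model = Tacotron2(
    num_chars=24,
    r=c.r,
    num_speakers=5,
    use_gst=True,
    gst=c.gst,  # one sub-config object replaces several gst_* kwargs
)
```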
2 changes: 1 addition & 1 deletion tests/test_tacotron2_tf_model.py
@@ -19,7 +19,7 @@
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
c = Tacotron2Config()


class TacotronTFTrainTest(unittest.TestCase):
21 changes: 7 additions & 14 deletions tests/test_tacotron_model.py
@@ -18,7 +18,7 @@
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

c = load_config(os.path.join(get_tests_input_path(), "test_config.json"))
c = TacotronConfig()

ap = AudioProcessor(**c.audio)
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")
@@ -175,10 +175,8 @@ def test_train_step():
model = Tacotron(
num_chars=32,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
use_gst=True,
gst=c.gst,
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
@@ -240,10 +238,8 @@ def test_train_step():
model = Tacotron(
num_chars=32,
num_speakers=5,
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
use_gst=True,
gst=c.gst,
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
r=c.r,
@@ -306,11 +302,8 @@ def test_train_step():
num_speakers=5,
postnet_output_dim=c.audio["fft_size"],
decoder_output_dim=c.audio["num_mels"],
gst=True,
gst_embedding_dim=c.gst["gst_embedding_dim"],
gst_num_heads=c.gst["gst_num_heads"],
gst_style_tokens=c.gst["gst_style_tokens"],
gst_use_speaker_embedding=c.gst["gst_use_speaker_embedding"],
use_gst=True,
gst=c.gst,
r=c.r,
memory_size=c.memory_size,
speaker_embedding_dim=55,