train.py

"""
Runs a model on a single node across multiple gpus.
"""
import os
import random
import warnings
from argparse import ArgumentParser

import numpy as np
import torch
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import (
    EarlyStopping,
    LearningRateMonitor,
    ModelCheckpoint,
)
from pytorch_lightning.loggers import TensorBoardLogger

from project.model.deepspeech_main import DeepSpeech

# warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.simplefilter("ignore", UserWarning)

seed = 17
np.random.seed(seed)
torch.manual_seed(seed)
random.seed(seed)
seed_everything(seed)
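# Note: seed_everything already seeds Python's random module, numpy and
# torch, so the three explicit calls above are redundant but harmless.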


def main(args):
    """Main training routine specific for this project."""
    # ------------------------
    # 1 INIT LIGHTNING MODEL
    # ------------------------
    model = DeepSpeech(**vars(args))

    # ------------------------
    # 2 INIT LOGGERS and special CALLBACKS
    # ------------------------
    # Early stopper: halt training once the monitored metric stops improving
    early_stop = EarlyStopping(
        monitor=args.early_stop_metric,
        patience=args.early_stop_patience,
        verbose=True,
    )
    # Checkpoint manager: keep the 5 checkpoints with the lowest WER
    checkpoint_callback = ModelCheckpoint(
        verbose=True,
        save_top_k=5,
        monitor="wer",
        mode="min",
        period=1,  # check once per epoch
    )
    # Loggers
    logger = TensorBoardLogger(save_dir=args.logs_path, name=args.experiment_name)
    lr_logger = LearningRateMonitor(logging_interval="step")
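    # With this TensorBoardLogger, Lightning writes event files and the saved
    # checkpoints under <logs_path>/<experiment_name>/version_<n>/ by default.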

    # ------------------------
    # 3 INIT TRAINER
    # ------------------------
    trainer = Trainer(
        gradient_clip_val=0,
        auto_scale_batch_size=False,
        gpus=1,
        auto_select_gpus=True,
        log_gpu_memory=True,
        precision=args.precision,
        logger=logger,
        callbacks=[lr_logger, early_stop, checkpoint_callback],
        # Resume path comes from the CLI; defaults to None (train from scratch)
        resume_from_checkpoint=args.resume_from_checkpoint,
        # auto_lr_find='learning_rate',
    )

    # ------------------------
    # 4 START TRAINING
    # ------------------------
    trainer.fit(model)


def run_cli():
    # ------------------------
    # TRAINING ARGUMENTS
    # ------------------------
    # these are project-wide arguments
    root_dir = os.path.dirname(os.path.realpath(__file__))
    parent_parser = ArgumentParser(add_help=False)
    # Model parser
    parser = DeepSpeech.add_model_specific_args(parent_parser)
    # Data
    parser.add_argument("--num_workers", default=4, type=int)
    parser.add_argument("--batch_size", default=40, type=int)
    parser.add_argument("--data_root", default="/mnt/kingston/datasets/", type=str)
    parser.add_argument(
        "--data_train",
        nargs="+",  # accept one or more split names
        default=["train-clean-100", "train-clean-360", "train-other-500"],
    )
    parser.add_argument("--data_test", nargs="+", default=["test-clean"])
    # Training params (opt)
    parser.add_argument("--epochs", default=100, type=int)
    parser.add_argument("--learning_rate", default=0.0005, type=float)
    parser.add_argument("--early_stop_metric", default="wer", type=str)
    parser.add_argument("--logs_path", default="runs/", type=str)
    parser.add_argument("--experiment_name", default="DeepSpeech", type=str)
    parser.add_argument("--early_stop_patience", default=3, type=int)
    parser.add_argument("--resume_from_checkpoint", default=None, type=str)
    # Precision args
    parser.add_argument("--amp_level", default="O2", type=str)
    parser.add_argument("--precision", default=32, type=int)
    args = parser.parse_args()

    # ---------------------
    # RUN TRAINING
    # ---------------------
    main(args)


if __name__ == "__main__":
    run_cli()
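

# Example invocation (illustrative; the dataset location and split names
# depend on your local LibriSpeech layout):
#
#   python train.py \
#       --data_root /path/to/LibriSpeech \
#       --batch_size 40 \
#       --epochs 100 \
#       --experiment_name DeepSpeech_baseline
#
# To resume an interrupted run, pass --resume_from_checkpoint with the path
# to a .ckpt file saved by the ModelCheckpoint callback.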