forked from Shark-NLP/DiffuSeq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtrain.py
executable file
·125 lines (109 loc) · 3.87 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
Train a DiffuSeq diffusion model on text data.
"""
import argparse
import json, torch, os
import numpy as np
from diffuseq.utils import dist_util, logger
from diffuseq.text_datasets import load_data_text
from diffuseq.step_sample import create_named_schedule_sampler
from basic_utils import (
load_defaults_config,
create_model_and_diffusion,
args_to_dict,
add_dict_to_argparser,
load_model_emb,
load_tokenizer
)
from train_util import TrainLoop
from transformers import set_seed
import wandb
from accelerate import Accelerator
### Customize your wandb settings here ###
# os.environ["WANDB_API_KEY"] = ""
# Set the wandb mode for this process at import time, before wandb.init() is called in main().
os.environ.update({"WANDB_MODE": "online"})
def create_argparser():
    """Build the command-line parser used by the training script.

    The mapping returned by ``load_defaults_config()`` is copied into a
    fresh dict and handed, together with a new ``argparse.ArgumentParser``,
    to ``add_dict_to_argparser``.

    Returns:
        argparse.ArgumentParser: the parser after ``add_dict_to_argparser``
        has processed the defaults.
    """
    arg_parser = argparse.ArgumentParser()
    # Copy the defaults so the parser setup never touches the loaded config object itself.
    default_values = dict(load_defaults_config())
    # Lets command-line values override the configured defaults.
    add_dict_to_argparser(arg_parser, default_values)
    return arg_parser
def main():
    """Parse the command-line arguments and run training end to end.

    In order, this function:

    1. parses the arguments and seeds via ``set_seed(args.seed)``;
    2. creates the ``Accelerator`` and configures the project logger;
    3. loads the tokenizer and embedding weights, then the training and
       validation data through ``load_data_text``;
    4. builds the model and diffusion objects and moves the model to
       ``dist_util.dev()``;
    5. writes the parsed arguments to
       ``<checkpoint_path>/training_args.json`` (creating the directory
       first if it does not exist);
    6. initialises wandb when ``LOCAL_RANK`` is unset or equal to 0;
    7. waits for all processes, then runs ``TrainLoop(...).run_loop()``.
    """
    args = create_argparser().parse_args()
    set_seed(args.seed)
    # dist_util.setup_dist()
    accelerator = Accelerator(device_placement=False)
    logger.configure()
    logger.log("### Creating data loader...")

    tokenizer = load_tokenizer(args)
    model_weight, tokenizer = load_model_emb(args, tokenizer)

    print("LOAD TRAIN DATA")
    data = load_data_text(
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        data_args=args,
        loaded_vocab=tokenizer,
        model_emb=model_weight,  # use model's weights as init
    )
    print("CALL ONE DATA")
    # NOTE(review): the fetched batch is discarded; this looks like a smoke
    # test of the data pipeline -- confirm it is still wanted.
    next(data)

    print("LOAD VALID DATA")
    data_valid = load_data_text(
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        data_args=args,
        split='valid',
        deterministic=True,
        loaded_vocab=tokenizer,
        model_emb=model_weight,  # use the same embedding weights as the training data
    )

    print('#'*30, 'size of vocab', args.vocab_size)

    logger.log("### Creating model and diffusion...")
    # print('#'*30, 'CUDA_VISIBLE_DEVICES', os.environ['CUDA_VISIBLE_DEVICES'])
    print("ARGS TO DICT")
    # Build the model/diffusion kwargs once and reuse them for both the
    # debug print and the constructor call.
    model_kwargs = args_to_dict(args, load_defaults_config().keys())
    print(model_kwargs)
    model, diffusion = create_model_and_diffusion(**model_kwargs)
    print(model)
    model.to(dist_util.dev())  # DEBUG **
    # model.cuda()  # DEBUG **

    pytorch_total_params = sum(p.numel() for p in model.parameters())
    logger.log(f'### The parameter count is {pytorch_total_params}')

    schedule_sampler = create_named_schedule_sampler(args.schedule_sampler, diffusion)

    logger.log(f'### Saving the hyperparameters to {args.checkpoint_path}/training_args.json')
    # On a fresh run the checkpoint directory may not exist yet; create it
    # so the dump below cannot fail with FileNotFoundError. exist_ok=True
    # keeps this safe when several ranks reach this point together.
    if args.checkpoint_path:
        os.makedirs(args.checkpoint_path, exist_ok=True)
    with open(f'{args.checkpoint_path}/training_args.json', 'w') as f:
        json.dump(args.__dict__, f, indent=2)

    # Only start a wandb run when LOCAL_RANK is unset or 0.
    if ('LOCAL_RANK' not in os.environ) or (int(os.environ['LOCAL_RANK']) == 0):
        wandb.init(
            project=os.getenv("WANDB_PROJECT", "DiffuSeq"),
            name=args.checkpoint_path,
        )
        wandb.config.update(args.__dict__, allow_val_change=True)

    print("WAIT FOR EVERYONE")
    accelerator.wait_for_everyone()

    logger.log("### Training...")
    TrainLoop(
        accelerator=accelerator,
        model=model,
        diffusion=diffusion,
        data=data,
        batch_size=args.batch_size,
        microbatch=args.microbatch,
        lr=args.lr,
        ema_rate=args.ema_rate,
        log_interval=args.log_interval,
        save_interval=args.save_interval,
        resume_checkpoint=args.resume_checkpoint,
        use_fp16=args.use_fp16,
        fp16_scale_growth=args.fp16_scale_growth,
        schedule_sampler=schedule_sampler,
        weight_decay=args.weight_decay,
        learning_steps=args.learning_steps,
        checkpoint_path=args.checkpoint_path,
        gradient_clipping=args.gradient_clipping,
        eval_data=data_valid,
        eval_interval=args.eval_interval,
    ).run_loop()
# Script entry point: start training only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()