From dfe08f395fb7860e5b888f37231ad8564852b95a Mon Sep 17 00:00:00 2001 From: BootsofLagrangian <125134079+BootsofLagrangian@users.noreply.github.com> Date: Sun, 4 Feb 2024 03:12:42 +0900 Subject: [PATCH 01/14] support deepspeed --- fine_tune.py | 41 ++++++++++++++++++++------- library/train_util.py | 54 +++++++++++++++++++++++++++++++++++ sdxl_train.py | 45 ++++++++++++++++++++--------- train_db.py | 37 ++++++++++++++++++------ train_network.py | 66 +++++++++++++++++++++++++++++++------------ 5 files changed, 194 insertions(+), 49 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 982dc8aec..78dfd1696 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -102,6 +102,7 @@ def train(args): # mixed precisionに対応した型を用意しておき適宜castする weight_dtype, save_dtype = train_util.prepare_dtype(args) + vae_dtype = torch.float32 if args.no_half_vae else weight_dtype # モデルを読み込む text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype, accelerator) @@ -152,7 +153,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # 学習を準備する if cache_latents: - vae.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=vae_dtype) vae.requires_grad_(False) vae.eval() with torch.no_grad(): @@ -187,7 +188,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if not cache_latents: vae.requires_grad_(False) vae.eval() - vae.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=vae_dtype) for m in training_models: m.requires_grad_(True) @@ -214,7 +215,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers, + num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. persistent_workers=args.persistent_data_loader_workers, ) @@ -240,13 +241,33 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): unet.to(weight_dtype) text_encoder.to(weight_dtype) - # acceleratorがなんかよろしくやってくれるらしい - if args.train_text_encoder: - unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, text_encoder, optimizer, train_dataloader, lr_scheduler - ) - else: - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + if args.deepspeed: + # wrapping model + class DeepSpeedModel(torch.nn.Module): + def __init__(self, unet, text_encoder, vae) -> None: + super().__init__() + self.unet = unet + self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) + self.vae = vae + def get_models(self): + return self.unet, self.text_encoders, self.vae + + unet.to(accelerator.device, dtype=weight_dtype) + [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] + ds_model = DeepSpeedModel(unet, text_encoders, vae) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) + # Now, ds_model is an instance of DeepSpeedEngine. 
+ unet, text_encoders, vae = ds_model.get_models() # for compatiblility + vae.to(vae_dtype) + text_encoder = text_encoders + + else: # acceleratorがなんかよろしくやってくれるらしい + if args.train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする if args.full_fp16: diff --git a/library/train_util.py b/library/train_util.py index ba428e508..2d85c9776 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -20,6 +20,7 @@ Union, ) from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs +from accelerate import DeepSpeedPlugin import gc import glob import math @@ -3124,6 +3125,47 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: "--prior_loss_weight", type=float, default=1.0, help="loss weight for regularization images / 正則化画像のlossの重み" ) + # DeepSpeed Arguments. https://huggingface.co/docs/accelerate/usage_guides/deepspeed + parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training") + parser.add_argument( + "--zero_stage", + type=int, default=2, + choices=[0, 1, 2, 3], + help="Possible options are 0,1,2,3." + ) + parser.add_argument( + "--offload_optimizer", + type=str, default=None, + choices=[None, "cpu", "nvme"], + help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3." + ) + parser.add_argument( + "--offload_optimizer_nvme_path", + type=str, default=None, + help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3." + ) + parser.add_argument( + "--offload_param_device", + type=str, default=None, + choices=[None, "cpu", "nvme"], + help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3." + ) + parser.add_argument( + "--offload_param_nvme_path", + type=str, default=None, + help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3." + ) + parser.add_argument( + "--zero3_init_flag", + action="store_true", + help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models." + "Only applicable with ZeRO Stage-3." + ) + parser.add_argument( + "--zero3_save_16bit_model", + action="store_true", + help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3." 
+ ) def verify_training_args(args: argparse.Namespace): if args.v_parameterization and not args.v2: @@ -3912,6 +3954,17 @@ def prepare_accelerator(args: argparse.Namespace): else None, ) kwargs_handlers = list(filter(lambda x: x is not None, kwargs_handlers)) + deepspeed_plugin = None + if args.deepspeed: + deepspeed_plugin = DeepSpeedPlugin( + zero_stage=args.zero_stage, + gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, + offload_optimizer=args.offload_optimizer, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, + offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, + zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, + ) + deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size + accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, mixed_precision=args.mixed_precision, @@ -3919,6 +3972,7 @@ def prepare_accelerator(args: argparse.Namespace): project_dir=logging_dir, kwargs_handlers=kwargs_handlers, dynamo_backend=dynamo_backend, + deepspeed_plugin=deepspeed_plugin, ) return accelerator diff --git a/sdxl_train.py b/sdxl_train.py index a3f6f3a17..6ce6c201e 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -354,7 +354,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers, + num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. persistent_workers=args.persistent_data_loader_workers, ) @@ -389,18 +389,37 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder1.to(weight_dtype) text_encoder2.to(weight_dtype) - # acceleratorがなんかよろしくやってくれるらしい - if train_unet: - unet = accelerator.prepare(unet) - if train_text_encoder1: - # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer - text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) - text_encoder1.text_model.final_layer_norm.requires_grad_(False) - text_encoder1 = accelerator.prepare(text_encoder1) - if train_text_encoder2: - text_encoder2 = accelerator.prepare(text_encoder2) - - optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + if args.deepspeed: + # Wrapping model for DeepSpeed + class DeepSpeedModel(torch.nn.Module): + def __init__(self, unet, text_encoder, vae) -> None: + super().__init__() + self.unet = unet + self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) + self.vae = vae + + def get_models(self): + return self.unet, self.text_encoders, self.vae + text_encoders = [text_encoder1, text_encoder2] + unet.to(accelerator.device, dtype=weight_dtype) + [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] + ds_model = DeepSpeedModel(unet, text_encoders, vae) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) + # Now, ds_model is an instance of DeepSpeedEngine. 
+ unet, text_encoders, vae = ds_model.get_models() # for compatiblility + vae.to(vae_dtype) # to avoid explicitly half-vae + text_encoder1, text_encoder2 = text_encoders[0], text_encoders[1] + else: # acceleratorがなんかよろしくやってくれるらしい + if train_unet: + unet = accelerator.prepare(unet) + if train_text_encoder1: + # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer + text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) + text_encoder1.text_model.final_layer_norm.requires_grad_(False) + text_encoder1 = accelerator.prepare(text_encoder1) + if train_text_encoder2: + text_encoder2 = accelerator.prepare(text_encoder2) + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: diff --git a/train_db.py b/train_db.py index 888cad25e..d5f47a179 100644 --- a/train_db.py +++ b/train_db.py @@ -184,7 +184,7 @@ def train(args): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers, + num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. persistent_workers=args.persistent_data_loader_workers, ) @@ -214,15 +214,36 @@ def train(args): text_encoder.to(weight_dtype) # acceleratorがなんかよろしくやってくれるらしい - if train_text_encoder: - unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( - unet, text_encoder, optimizer, train_dataloader, lr_scheduler - ) + if args.deepspeed: + # wrapping model + class DeepSpeedModel(torch.nn.Module): + def __init__(self, unet, text_encoder, vae) -> None: + super().__init__() + self.unet = unet + self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) + self.vae = vae + + def get_models(self): + return self.unet, self.text_encoders, self.vae + + unet.to(accelerator.device, dtype=weight_dtype) + [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] + ds_model = DeepSpeedModel(unet, text_encoders, vae) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) + # Now, ds_model is an instance of DeepSpeedEngine. + unet, text_encoders, vae = ds_model.get_models() # for compatiblility + vae.to(vae_dtype) # to avoid explicitly half-vae + text_encoder = text_encoders else: - unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + if train_text_encoder: + unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + unet, text_encoder, optimizer, train_dataloader, lr_scheduler + ) + else: + unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) - if not train_text_encoder: - text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error + if not train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする if args.full_fp16: diff --git a/train_network.py b/train_network.py index 8d102ae8f..05dbe2de7 100644 --- a/train_network.py +++ b/train_network.py @@ -353,18 +353,26 @@ def train(self, args): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers, + num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. 
persistent_workers=args.persistent_data_loader_workers, ) # 学習ステップ数を計算する if args.max_train_epochs is not None: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print( - f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) + if args.deepspeed: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + else: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -409,20 +417,42 @@ def train(self, args): t_enc.text_model.embeddings.to(dtype=(weight_dtype if te_weight_dtype != weight_dtype else te_weight_dtype)) # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good - if train_unet: - unet = accelerator.prepare(unet) + if args.deepspeed: + # wrapping model + class DeepSpeedModel(torch.nn.Module): + def __init__(self, unet, text_encoder, vae, network) -> None: + super().__init__() + self.unet = unet + self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) + self.vae = vae + self.network = network + + def get_models(self): + return self.unet, self.text_encoders, self.vae, self.network + + unet.to(accelerator.device, dtype=unet_weight_dtype) + [t_enc.to(accelerator.device, dtype=te_weight_dtype) for t_enc in text_encoders] + ds_model = DeepSpeedModel(unet, text_encoders, vae, network) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) + # Now, ds_model is an instance of DeepSpeedEngine. + unet, text_encoders, vae, network = ds_model.get_models() # for compatiblility + vae.to(vae_dtype) # to avoid explicitly half-vae + text_encoder = text_encoders else: - unet.to(accelerator.device, dtype=unet_weight_dtype) # move to device because unet is not prepared by accelerator - if train_text_encoder: - if len(text_encoders) > 1: - text_encoder = text_encoders = [accelerator.prepare(t_enc) for t_enc in text_encoders] + if train_unet: + unet = accelerator.prepare(unet) else: - text_encoder = accelerator.prepare(text_encoder) - text_encoders = [text_encoder] - else: - pass # if text_encoder is not trained, no need to prepare. and device and dtype are already set + unet.to(accelerator.device, dtype=unet_weight_dtype) # move to device because unet is not prepared by accelerator + if train_text_encoder: + if len(text_encoders) > 1: + text_encoder = text_encoders = [accelerator.prepare(t_enc) for t_enc in text_encoders] + else: + text_encoder = accelerator.prepare(text_encoder) + text_encoders = [text_encoder] + else: + pass # if text_encoder is not trained, no need to prepare. 
and device and dtype are already set - network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler) + network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler) if args.gradient_checkpointing: # according to TI example in Diffusers, train is required From 64873c1b4317afad99a1d397454ba0c64c6cb0b1 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 17:11:50 +0900 Subject: [PATCH 02/14] fix offload_optimizer_device typo --- library/train_util.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/library/train_util.py b/library/train_util.py index 2d85c9776..933a34c48 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3134,7 +3134,7 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: help="Possible options are 0,1,2,3." ) parser.add_argument( - "--offload_optimizer", + "--offload_optimizer_device", type=str, default=None, choices=[None, "cpu", "nvme"], help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3." @@ -3959,7 +3959,7 @@ def prepare_accelerator(args: argparse.Namespace): deepspeed_plugin = DeepSpeedPlugin( zero_stage=args.zero_stage, gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, - offload_optimizer=args.offload_optimizer, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, + offload_optimizer_device=args.offload_optimizer_device, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, ) From 2824312d5eb6df118d7585cde7e84d4cdae6f6c6 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 20:13:28 +0900 Subject: [PATCH 03/14] fix vae type error during training sdxl --- library/sdxl_train_util.py | 1 - library/train_util.py | 5 ----- sdxl_train.py | 25 +++++++++++-------------- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/library/sdxl_train_util.py b/library/sdxl_train_util.py index 5ad748d15..ff7fef176 100644 --- a/library/sdxl_train_util.py +++ b/library/sdxl_train_util.py @@ -17,7 +17,6 @@ def load_target_model(args, accelerator, model_version: str, weight_dtype): - # load models for each process model_dtype = match_mixed_precision(args, weight_dtype) # prepare fp16/bf16 for pi in range(accelerator.state.num_processes): if pi == accelerator.state.local_process_index: diff --git a/library/train_util.py b/library/train_util.py index 933a34c48..a20edbe15 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -4042,28 +4042,23 @@ def _load_target_model(args: argparse.Namespace, weight_dtype, device="cpu", une def load_target_model(args, weight_dtype, accelerator, unet_use_linear_projection_in_v2=False): - # load models for each process for pi in range(accelerator.state.num_processes): if pi == accelerator.state.local_process_index: print(f"loading model for process {accelerator.state.local_process_index}/{accelerator.state.num_processes}") - text_encoder, vae, unet, load_stable_diffusion_format = _load_target_model( args, weight_dtype, accelerator.device if args.lowram else "cpu", unet_use_linear_projection_in_v2=unet_use_linear_projection_in_v2, ) - # work on low-ram device if args.lowram: text_encoder.to(accelerator.device) 
unet.to(accelerator.device) vae.to(accelerator.device) - gc.collect() torch.cuda.empty_cache() accelerator.wait_for_everyone() - return text_encoder, vae, unet, load_stable_diffusion_format diff --git a/sdxl_train.py b/sdxl_train.py index 6ce6c201e..e8680828b 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -392,23 +392,20 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.deepspeed: # Wrapping model for DeepSpeed class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, vae) -> None: + def __init__(self, unet, text_encoder) -> None: super().__init__() self.unet = unet self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.vae = vae def get_models(self): - return self.unet, self.text_encoders, self.vae + return self.unet, self.text_encoders text_encoders = [text_encoder1, text_encoder2] - unet.to(accelerator.device, dtype=weight_dtype) - [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] - ds_model = DeepSpeedModel(unet, text_encoders, vae) + ds_model = DeepSpeedModel(unet, text_encoders) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) # Now, ds_model is an instance of DeepSpeedEngine. - unet, text_encoders, vae = ds_model.get_models() # for compatiblility - vae.to(vae_dtype) # to avoid explicitly half-vae - text_encoder1, text_encoder2 = text_encoders[0], text_encoders[1] + unet, text_encoders = ds_model.get_models() # for compatiblility + text_encoder1, text_encoder2 = text_encoder = text_encoders + training_models = [unet, text_encoder1, text_encoder2] else: # acceleratorがなんかよろしくやってくれるらしい if train_unet: unet = accelerator.prepare(unet) @@ -493,10 +490,10 @@ def get_models(self): for step, batch in enumerate(train_dataloader): current_step.value = global_step with accelerator.accumulate(*training_models): - if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) - else: - with torch.no_grad(): + with torch.no_grad(): # why this block differ within train_network.py? 
+ if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + else: # latentに変換 latents = vae.encode(batch["images"].to(vae_dtype)).latent_dist.sample().to(weight_dtype) @@ -504,7 +501,7 @@ def get_models(self): if torch.any(torch.isnan(latents)): accelerator.print("NaN found in latents, replacing with zeros") latents = torch.nan_to_num(latents, 0, out=latents) - latents = latents * sdxl_model_util.VAE_SCALE_FACTOR + latents = latents * sdxl_model_util.VAE_SCALE_FACTOR if "text_encoder_outputs1_list" not in batch or batch["text_encoder_outputs1_list"] is None: input_ids1 = batch["input_ids"] From 4295f91dcd75a7405aa70d5c5d2c826a618a4bcc Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 20:19:56 +0900 Subject: [PATCH 04/14] fix all trainer about vae --- fine_tune.py | 29 ++++++++++++++++------------- train_db.py | 29 ++++++++++++++++------------- train_network.py | 15 +++++---------- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 78dfd1696..f901ee641 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -221,10 +221,18 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # 学習ステップ数を計算する if args.max_train_epochs is not None: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + if args.deepspeed: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + accelerator.print( + f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + else: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -244,21 +252,16 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.deepspeed: # wrapping model class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, vae) -> None: + def __init__(self, unet, text_encoder) -> None: super().__init__() self.unet = unet self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.vae = vae def get_models(self): - return self.unet, self.text_encoders, self.vae - - unet.to(accelerator.device, dtype=weight_dtype) - [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] - ds_model = DeepSpeedModel(unet, text_encoders, vae) + return self.unet, self.text_encoders + ds_model = DeepSpeedModel(unet, text_encoders) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) # Now, ds_model is an instance of DeepSpeedEngine. 
- unet, text_encoders, vae = ds_model.get_models() # for compatiblility - vae.to(vae_dtype) + unet, text_encoders = ds_model.get_models() # for compatiblility text_encoder = text_encoders else: # acceleratorがなんかよろしくやってくれるらしい diff --git a/train_db.py b/train_db.py index d5f47a179..fa7f6a8dc 100644 --- a/train_db.py +++ b/train_db.py @@ -190,10 +190,18 @@ def train(args): # 学習ステップ数を計算する if args.max_train_epochs is not None: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + if args.deepspeed: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / args.gradient_accumulation_steps + ) + accelerator.print( + f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) + else: + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -217,22 +225,17 @@ def train(args): if args.deepspeed: # wrapping model class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, vae) -> None: + def __init__(self, unet, text_encoder) -> None: super().__init__() self.unet = unet self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.vae = vae def get_models(self): - return self.unet, self.text_encoders, self.vae - - unet.to(accelerator.device, dtype=weight_dtype) - [t_enc.to(accelerator.device, dtype=weight_dtype) for t_enc in text_encoders] - ds_model = DeepSpeedModel(unet, text_encoders, vae) + return self.unet, self.text_encoders + ds_model = DeepSpeedModel(unet, text_encoders) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) # Now, ds_model is an instance of DeepSpeedEngine. - unet, text_encoders, vae = ds_model.get_models() # for compatiblility - vae.to(vae_dtype) # to avoid explicitly half-vae + unet, text_encoders = ds_model.get_models() # for compatiblility text_encoder = text_encoders else: if train_text_encoder: diff --git a/train_network.py b/train_network.py index 05dbe2de7..bbda427aa 100644 --- a/train_network.py +++ b/train_network.py @@ -364,7 +364,7 @@ def train(self, args): len(train_dataloader) / args.gradient_accumulation_steps ) accelerator.print( - f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. 
steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" ) else: args.max_train_steps = args.max_train_epochs * math.ceil( @@ -420,23 +420,18 @@ def train(self, args): if args.deepspeed: # wrapping model class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, vae, network) -> None: + def __init__(self, unet, text_encoder, network) -> None: super().__init__() self.unet = unet self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.vae = vae self.network = network def get_models(self): - return self.unet, self.text_encoders, self.vae, self.network - - unet.to(accelerator.device, dtype=unet_weight_dtype) - [t_enc.to(accelerator.device, dtype=te_weight_dtype) for t_enc in text_encoders] - ds_model = DeepSpeedModel(unet, text_encoders, vae, network) + return self.unet, self.text_encoders, self.network + ds_model = DeepSpeedModel(unet, text_encoders, network) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) # Now, ds_model is an instance of DeepSpeedEngine. - unet, text_encoders, vae, network = ds_model.get_models() # for compatiblility - vae.to(vae_dtype) # to avoid explicitly half-vae + unet, text_encoders, network = ds_model.get_models() # for compatiblility text_encoder = text_encoders else: if train_unet: From 3970bf40804d9c66e76e0af5e1d0477f19bfa79a Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 22:40:43 +0900 Subject: [PATCH 05/14] maybe fix branch to run offloading --- library/train_util.py | 2 ++ sdxl_train.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/library/train_util.py b/library/train_util.py index a20edbe15..676652e90 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3964,6 +3964,8 @@ def prepare_accelerator(args: argparse.Namespace): zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, ) deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size + deepspeed_plugin.deepspeed_config['train_batch_size'] = \ + args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, diff --git a/sdxl_train.py b/sdxl_train.py index e8680828b..ef3ead380 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -391,6 +391,12 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.deepspeed: # Wrapping model for DeepSpeed + import deepspeed + if args.offload_optimizer_device is not None: + accelerator.print('[DeepSpeed] start to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + accelerator.print('[DeepSpeed] building cpu_adam done.') + class DeepSpeedModel(torch.nn.Module): def __init__(self, unet, text_encoder) -> None: super().__init__() From 7d2a9268b9d8d3c9b78068aaa2f9d43eb8b6101b Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Mon, 5 Feb 2024 22:42:06 +0900 Subject: [PATCH 06/14] apply offloading method runable for all trainer --- fine_tune.py | 5 +++++ train_db.py | 5 +++++ train_network.py | 5 +++++ 3 files changed, 15 insertions(+) diff --git a/fine_tune.py b/fine_tune.py index f901ee641..85febeaaa 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -251,6 +251,11 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if args.deepspeed: # wrapping model + import deepspeed + if args.offload_optimizer_device is not None: + accelerator.print('[DeepSpeed] start to 
manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + accelerator.print('[DeepSpeed] building cpu_adam done.') class DeepSpeedModel(torch.nn.Module): def __init__(self, unet, text_encoder) -> None: super().__init__() diff --git a/train_db.py b/train_db.py index fa7f6a8dc..e26618867 100644 --- a/train_db.py +++ b/train_db.py @@ -224,6 +224,11 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい if args.deepspeed: # wrapping model + import deepspeed + if args.offload_optimizer_device is not None: + accelerator.print('[DeepSpeed] start to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + accelerator.print('[DeepSpeed] building cpu_adam done.') class DeepSpeedModel(torch.nn.Module): def __init__(self, unet, text_encoder) -> None: super().__init__() diff --git a/train_network.py b/train_network.py index bbda427aa..050a65111 100644 --- a/train_network.py +++ b/train_network.py @@ -419,6 +419,11 @@ def train(self, args): # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: # wrapping model + import deepspeed + if args.offload_optimizer_device is not None: + accelerator.print('[DeepSpeed] start to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + accelerator.print('[DeepSpeed] building cpu_adam done.') class DeepSpeedModel(torch.nn.Module): def __init__(self, unet, text_encoder, network) -> None: super().__init__() From 62556619bdc876c450bfb1445b16683cf3a98699 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Wed, 7 Feb 2024 16:42:05 +0900 Subject: [PATCH 07/14] fix full_fp16 compatible and train_step --- fine_tune.py | 16 ++----- library/train_util.py | 7 +++ sdxl_train.py | 3 +- test_pip_requirements.txt | 96 +++++++++++++++++++++++++++++++++++++++ train_db.py | 16 ++----- train_network.py | 23 ++++------ 6 files changed, 121 insertions(+), 40 deletions(-) create mode 100644 test_pip_requirements.txt diff --git a/fine_tune.py b/fine_tune.py index 85febeaaa..eb652742c 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -221,18 +221,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # 学習ステップ数を計算する if args.max_train_epochs is not None: - if args.deepspeed: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps - ) - accelerator.print( - f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) - else: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) diff --git a/library/train_util.py b/library/train_util.py index 676652e90..ea6265109 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3166,6 +3166,11 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: action="store_true", help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3." 
) + parser.add_argument( + "--fp16_master_weights_and_gradients", + action="store_true", + help="fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32." + ) def verify_training_args(args: argparse.Namespace): if args.v_parameterization and not args.v2: @@ -3966,6 +3971,8 @@ def prepare_accelerator(args: argparse.Namespace): deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size deepspeed_plugin.deepspeed_config['train_batch_size'] = \ args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) + if args.full_fp16 or args.fp16_master_weights_and_gradients: + deepspeed_plugin.deepspeed_config['fp16_master_weights_and_gradients'] = True accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, diff --git a/sdxl_train.py b/sdxl_train.py index ef3ead380..54902b873 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -437,7 +437,8 @@ def get_models(self): text_encoder2.to(accelerator.device) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16: + if args.full_fp16 and not args.deepspeed: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. train_util.patch_accelerator_for_fp16_training(accelerator) # resumeする diff --git a/test_pip_requirements.txt b/test_pip_requirements.txt new file mode 100644 index 000000000..6abec3516 --- /dev/null +++ b/test_pip_requirements.txt @@ -0,0 +1,96 @@ +absl-py==2.1.0 +accelerate==0.25.0 +aiohttp==3.9.3 +aiosignal==1.3.1 +altair==4.2.2 +annotated-types @ file:///home/conda/feedstock_root/build_artifacts/annotated-types_1696634205638/work +async-timeout==4.0.3 +attrs==23.2.0 +bitsandbytes==0.42.0 +Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1695989787169/work +cachetools==5.3.2 +certifi==2022.12.7 +charset-normalizer==2.1.1 +cmake==3.25.0 +deepspeed==0.13.1 +diffusers==0.25.0 +easygui==0.98.3 +einops==0.6.1 +entrypoints==0.4 +filelock==3.9.0 +frozenlist==1.4.1 +fsspec==2024.2.0 +ftfy==6.1.1 +gmpy2 @ file:///home/conda/feedstock_root/build_artifacts/gmpy2_1666808654411/work +google-auth==2.27.0 +google-auth-oauthlib==0.4.6 +grpcio==1.60.1 +hjson==3.1.0 +huggingface-hub==0.20.1 +idna==3.4 +importlib-metadata==7.0.1 +Jinja2==3.1.2 +jsonschema==4.21.1 +jsonschema-specifications==2023.12.1 +-e git+https://github.com/kohya-ss/sd-scripts@cd19df49cd512e13ac90db115c424d19c0e8868a#egg=library +lightning-utilities==0.10.1 +lit==15.0.7 +Markdown==3.5.2 +MarkupSafe==2.1.3 +mpmath==1.3.0 +multidict==6.0.5 +networkx==3.2.1 +ninja==1.11.1.1 +numpy==1.26.3 +oauthlib==3.2.2 +open-clip-torch==2.20.0 +opencv-python==4.7.0.68 +packaging==23.2 +pandas==2.2.0 +pillow==10.2.0 +protobuf==3.19.6 +psutil==5.9.8 +py-cpuinfo @ file:///home/conda/feedstock_root/build_artifacts/py-cpuinfo_1666774466606/work +pyasn1==0.5.1 +pyasn1-modules==0.3.0 +pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1706543943340/work +pydantic_core @ file:///home/conda/feedstock_root/build_artifacts/pydantic-core_1705674688239/work +pynvml==11.5.0 +PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work +python-dateutil==2.8.2 +pytorch-lightning==1.9.0 +pytz==2024.1 +PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1695373428874/work +referencing==0.33.0 +regex==2023.12.25 +requests==2.28.1 +requests-oauthlib==1.3.1 +rpds-py==0.17.1 +rsa==4.9 
+safetensors==0.4.2 +scipy==1.12.0 +sentencepiece==0.1.99 +six==1.16.0 +sympy==1.12 +tensorboard==2.10.1 +tensorboard-data-server==0.6.1 +tensorboard-plugin-wit==1.8.1 +timm==0.9.12 +tokenizers==0.15.1 +toml==0.10.2 +toolz==0.12.1 +torch==2.0.1+cu118 +torchaudio==2.2.0 +torchmetrics==1.3.0.post0 +torchvision==0.15.2+cu118 +tqdm==4.66.1 +transformers==4.36.2 +triton==2.0.0 +typing_extensions==4.8.0 +tzdata==2023.4 +urllib3==1.26.13 +voluptuous==0.13.1 +wcwidth==0.2.13 +Werkzeug==3.0.1 +yarl==1.9.4 +zipp==3.17.0 diff --git a/train_db.py b/train_db.py index e26618867..58536555e 100644 --- a/train_db.py +++ b/train_db.py @@ -190,18 +190,10 @@ def train(args): # 学習ステップ数を計算する if args.max_train_epochs is not None: - if args.deepspeed: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps - ) - accelerator.print( - f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) - else: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) diff --git a/train_network.py b/train_network.py index 050a65111..cc445d39a 100644 --- a/train_network.py +++ b/train_network.py @@ -359,20 +359,12 @@ def train(self, args): # 学習ステップ数を計算する if args.max_train_epochs is not None: - if args.deepspeed: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / args.gradient_accumulation_steps - ) - accelerator.print( - f"[DeepSpeed] override steps not dividing by {accelerator.num_processes}. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) - else: - args.max_train_steps = args.max_train_epochs * math.ceil( - len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps - ) - accelerator.print( - f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" - ) + args.max_train_steps = args.max_train_epochs * math.ceil( + len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps + ) + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -479,7 +471,8 @@ def get_models(self): vae.to(accelerator.device, dtype=vae_dtype) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16: + if args.full_fp16 and not args.deepspeed: + # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. 
train_util.patch_accelerator_for_fp16_training(accelerator) # resumeする From 2445a5b74e4c5bb0af24e0b3162c1eaef218b56b Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Wed, 7 Feb 2024 16:48:18 +0900 Subject: [PATCH 08/14] remove test requirements --- test_pip_requirements.txt | 96 --------------------------------------- 1 file changed, 96 deletions(-) delete mode 100644 test_pip_requirements.txt diff --git a/test_pip_requirements.txt b/test_pip_requirements.txt deleted file mode 100644 index 6abec3516..000000000 --- a/test_pip_requirements.txt +++ /dev/null @@ -1,96 +0,0 @@ -absl-py==2.1.0 -accelerate==0.25.0 -aiohttp==3.9.3 -aiosignal==1.3.1 -altair==4.2.2 -annotated-types @ file:///home/conda/feedstock_root/build_artifacts/annotated-types_1696634205638/work -async-timeout==4.0.3 -attrs==23.2.0 -bitsandbytes==0.42.0 -Brotli @ file:///home/conda/feedstock_root/build_artifacts/brotli-split_1695989787169/work -cachetools==5.3.2 -certifi==2022.12.7 -charset-normalizer==2.1.1 -cmake==3.25.0 -deepspeed==0.13.1 -diffusers==0.25.0 -easygui==0.98.3 -einops==0.6.1 -entrypoints==0.4 -filelock==3.9.0 -frozenlist==1.4.1 -fsspec==2024.2.0 -ftfy==6.1.1 -gmpy2 @ file:///home/conda/feedstock_root/build_artifacts/gmpy2_1666808654411/work -google-auth==2.27.0 -google-auth-oauthlib==0.4.6 -grpcio==1.60.1 -hjson==3.1.0 -huggingface-hub==0.20.1 -idna==3.4 -importlib-metadata==7.0.1 -Jinja2==3.1.2 -jsonschema==4.21.1 -jsonschema-specifications==2023.12.1 --e git+https://github.com/kohya-ss/sd-scripts@cd19df49cd512e13ac90db115c424d19c0e8868a#egg=library -lightning-utilities==0.10.1 -lit==15.0.7 -Markdown==3.5.2 -MarkupSafe==2.1.3 -mpmath==1.3.0 -multidict==6.0.5 -networkx==3.2.1 -ninja==1.11.1.1 -numpy==1.26.3 -oauthlib==3.2.2 -open-clip-torch==2.20.0 -opencv-python==4.7.0.68 -packaging==23.2 -pandas==2.2.0 -pillow==10.2.0 -protobuf==3.19.6 -psutil==5.9.8 -py-cpuinfo @ file:///home/conda/feedstock_root/build_artifacts/py-cpuinfo_1666774466606/work -pyasn1==0.5.1 -pyasn1-modules==0.3.0 -pydantic @ file:///home/conda/feedstock_root/build_artifacts/pydantic_1706543943340/work -pydantic_core @ file:///home/conda/feedstock_root/build_artifacts/pydantic-core_1705674688239/work -pynvml==11.5.0 -PySocks @ file:///home/conda/feedstock_root/build_artifacts/pysocks_1661604839144/work -python-dateutil==2.8.2 -pytorch-lightning==1.9.0 -pytz==2024.1 -PyYAML @ file:///home/conda/feedstock_root/build_artifacts/pyyaml_1695373428874/work -referencing==0.33.0 -regex==2023.12.25 -requests==2.28.1 -requests-oauthlib==1.3.1 -rpds-py==0.17.1 -rsa==4.9 -safetensors==0.4.2 -scipy==1.12.0 -sentencepiece==0.1.99 -six==1.16.0 -sympy==1.12 -tensorboard==2.10.1 -tensorboard-data-server==0.6.1 -tensorboard-plugin-wit==1.8.1 -timm==0.9.12 -tokenizers==0.15.1 -toml==0.10.2 -toolz==0.12.1 -torch==2.0.1+cu118 -torchaudio==2.2.0 -torchmetrics==1.3.0.post0 -torchvision==0.15.2+cu118 -tqdm==4.66.1 -transformers==4.36.2 -triton==2.0.0 -typing_extensions==4.8.0 -tzdata==2023.4 -urllib3==1.26.13 -voluptuous==0.13.1 -wcwidth==0.2.13 -Werkzeug==3.0.1 -yarl==1.9.4 -zipp==3.17.0 From a98fecaeb1e818c778c90fe441a71a8bd34615ff Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Wed, 7 Feb 2024 17:19:46 +0900 Subject: [PATCH 09/14] forgot setting mixed_precision for deepspeed. 
sorry --- library/train_util.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/library/train_util.py b/library/train_util.py index ea6265109..dbe5a61ce 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3971,6 +3971,9 @@ def prepare_accelerator(args: argparse.Namespace): deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size deepspeed_plugin.deepspeed_config['train_batch_size'] = \ args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) + deepspeed_plugin.set_mixed_precision(args.mixed_precision) + if args.mixed_precision.lower() == "fp16": + deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 if args.full_fp16 or args.fp16_master_weights_and_gradients: deepspeed_plugin.deepspeed_config['fp16_master_weights_and_gradients'] = True From 03f0816f86b2d4d8915d81146242fb6f7f99c5ff Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Fri, 9 Feb 2024 17:47:49 +0900 Subject: [PATCH 10/14] the reason not working grad accum steps found. it was becasue of my accelerate settings --- fine_tune.py | 5 +++-- library/train_util.py | 6 +++++- sdxl_train.py | 5 +++-- train_db.py | 5 +++-- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index eb652742c..741e9c857 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -224,8 +224,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): args.max_train_steps = args.max_train_epochs * math.ceil( len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") - + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) diff --git a/library/train_util.py b/library/train_util.py index dbe5a61ce..61c836247 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3975,7 +3975,11 @@ def prepare_accelerator(args: argparse.Namespace): if args.mixed_precision.lower() == "fp16": deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 if args.full_fp16 or args.fp16_master_weights_and_gradients: - deepspeed_plugin.deepspeed_config['fp16_master_weights_and_gradients'] = True + if args.offload_optimizer_device == "cpu": + deepspeed_plugin.deepspeed_config['fp16']['fp16_master_weights_and_grads'] = True + print("[DeepSpeed] full fp16 enable.") + else: + print("full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam.") accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, diff --git a/sdxl_train.py b/sdxl_train.py index 54902b873..6ffb1bbaf 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -363,8 +363,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): args.max_train_steps = args.max_train_epochs * math.ceil( len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") - + accelerator.print( + f"override steps. 
steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) diff --git a/train_db.py b/train_db.py index 58536555e..c336a1c1c 100644 --- a/train_db.py +++ b/train_db.py @@ -193,8 +193,9 @@ def train(args): args.max_train_steps = args.max_train_epochs * math.ceil( len(train_dataloader) / accelerator.num_processes / args.gradient_accumulation_steps ) - accelerator.print(f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}") - + accelerator.print( + f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" + ) # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) From 4d5186d1cf0b0fbda20513def793ac3f5e9d5ea0 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Thu, 22 Feb 2024 16:20:53 +0900 Subject: [PATCH 11/14] refactored codes, some function moved into train_utils.py --- fine_tune.py | 29 +++++++--------- library/train_util.py | 78 +++++++++++++++++++++++++++++++------------ sdxl_train.py | 43 ++++++++++++------------ train_db.py | 31 ++++++++--------- train_network.py | 34 +++++++++---------- 5 files changed, 119 insertions(+), 96 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index 741e9c857..862607545 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -243,24 +243,19 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder.to(weight_dtype) if args.deepspeed: - # wrapping model - import deepspeed - if args.offload_optimizer_device is not None: - accelerator.print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - accelerator.print('[DeepSpeed] building cpu_adam done.') - class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder) -> None: - super().__init__() - self.unet = unet - self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - def get_models(self): - return self.unet, self.text_encoders - ds_model = DeepSpeedModel(unet, text_encoders) + training_models_dict = {} + training_models_dict["unet"] = unet + if args.train_text_encoder: training_models_dict["text_encoder"] = text_encoder + + ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - # Now, ds_model is an instance of DeepSpeedEngine. 
- unet, text_encoders = ds_model.get_models() # for compatiblility - text_encoder = text_encoders + + training_models = [] + unet = ds_model.models["unet"] + training_models.append(unet) + if args.train_text_encoder: + text_encoder = ds_model.models["text_encoder"] + training_models.append(text_encoder) else: # acceleratorがなんかよろしくやってくれるらしい if args.train_text_encoder: diff --git a/library/train_util.py b/library/train_util.py index 61c836247..334aaa21e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -3959,27 +3959,7 @@ def prepare_accelerator(args: argparse.Namespace): else None, ) kwargs_handlers = list(filter(lambda x: x is not None, kwargs_handlers)) - deepspeed_plugin = None - if args.deepspeed: - deepspeed_plugin = DeepSpeedPlugin( - zero_stage=args.zero_stage, - gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, - offload_optimizer_device=args.offload_optimizer_device, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, - offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, - zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, - ) - deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size - deepspeed_plugin.deepspeed_config['train_batch_size'] = \ - args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) - deepspeed_plugin.set_mixed_precision(args.mixed_precision) - if args.mixed_precision.lower() == "fp16": - deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 - if args.full_fp16 or args.fp16_master_weights_and_gradients: - if args.offload_optimizer_device == "cpu": - deepspeed_plugin.deepspeed_config['fp16']['fp16_master_weights_and_grads'] = True - print("[DeepSpeed] full fp16 enable.") - else: - print("full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam.") + deepspeed_plugin = prepare_deepspeed_plugin(args) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -3992,6 +3972,62 @@ def prepare_accelerator(args: argparse.Namespace): ) return accelerator +def prepare_deepspeed_plugin(args: argparse.Namespace): + if args.deepspeed is None: return None + try: + import deepspeed + except ImportError as e: + print("deepspeed is not installed. please install deepspeed in your environment with following command. DS_BUILD_OPS=0 pip install deepspeed") + exit(1) + + deepspeed_plugin = DeepSpeedPlugin( + zero_stage=args.zero_stage, + gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, + offload_optimizer_device=args.offload_optimizer_device, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, + offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, + zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, + ) + deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size + deepspeed_plugin.deepspeed_config['train_batch_size'] = \ + args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) + deepspeed_plugin.set_mixed_precision(args.mixed_precision) + if args.mixed_precision.lower() == "fp16": + deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 # preventing overflow. 
+ if args.full_fp16 or args.fp16_master_weights_and_gradients: + if args.offload_optimizer_device == "cpu" and args.zero_stage == 2: + deepspeed_plugin.deepspeed_config['fp16']['fp16_master_weights_and_grads'] = True + print("[DeepSpeed] full fp16 enable.") + else: + print("[DeepSpeed]full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam on ZeRO-2 stage.") + + if args.offload_optimizer_device is not None: + print('[DeepSpeed] start to manually build cpu_adam.') + deepspeed.ops.op_builder.CPUAdamBuilder().load() + print('[DeepSpeed] building cpu_adam done.') + + return deepspeed_plugin + +def prepare_deepspeed_model(args: argparse.Namespace, **models): + class DeepSpeedWrapper(torch.nn.Module): + def __init__(self, **kw_models) -> None: + super().__init__() + self.models = torch.nn.ModuleDict() + + for key, model in kw_models.items(): + if isinstance(model, list): + model = torch.nn.ModuleList(model) + assert isinstance(model, torch.nn.Module), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}" + self.models.update( + torch.nn.ModuleDict( + {key: model} + ) + ) + + def get_models(self): + return self.models + + ds_model = DeepSpeedWrapper(**models) + return ds_model def prepare_dtype(args: argparse.Namespace): weight_dtype = torch.float32 diff --git a/sdxl_train.py b/sdxl_train.py index 6ffb1bbaf..2f1a5ce65 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -391,28 +391,29 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder2.to(weight_dtype) if args.deepspeed: - # Wrapping model for DeepSpeed - import deepspeed - if args.offload_optimizer_device is not None: - accelerator.print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - accelerator.print('[DeepSpeed] building cpu_adam done.') - - class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder) -> None: - super().__init__() - self.unet = unet - self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - - def get_models(self): - return self.unet, self.text_encoders - text_encoders = [text_encoder1, text_encoder2] - ds_model = DeepSpeedModel(unet, text_encoders) + training_models_dict = {} + if train_unet: + training_models_dict["unet"] = unet + if train_text_encoder1: + text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) + text_encoder1.text_model.final_layer_norm.requires_grad_(False) + training_models_dict["text_encoder1"] = text_encoder1 + if train_text_encoder2: + training_models_dict["text_encoder2"] = text_encoder2 + ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - # Now, ds_model is an instance of DeepSpeedEngine. 
- unet, text_encoders = ds_model.get_models() # for compatiblility - text_encoder1, text_encoder2 = text_encoder = text_encoders - training_models = [unet, text_encoder1, text_encoder2] + + training_models = [] # override training_models + if train_unet: + unet = ds_model.models["unet"] + training_models.append(unet) + if train_text_encoder1: + text_encoder1 = ds_model.models["text_encoder1"] + training_models.append(text_encoder1) + if train_text_encoder2: + text_encoder2 = ds_model.models["text_encoder2"] + training_models.append(text_encoder2) + else: # acceleratorがなんかよろしくやってくれるらしい if train_unet: unet = accelerator.prepare(unet) diff --git a/train_db.py b/train_db.py index c336a1c1c..f188d7bd9 100644 --- a/train_db.py +++ b/train_db.py @@ -216,25 +216,20 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい if args.deepspeed: - # wrapping model - import deepspeed - if args.offload_optimizer_device is not None: - accelerator.print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - accelerator.print('[DeepSpeed] building cpu_adam done.') - class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder) -> None: - super().__init__() - self.unet = unet - self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - - def get_models(self): - return self.unet, self.text_encoders - ds_model = DeepSpeedModel(unet, text_encoders) + training_models_dict = {} + training_models_dict["unet"] = unet + if train_text_encoder: training_models_dict["text_encoder"] = text_encoder + + ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - # Now, ds_model is an instance of DeepSpeedEngine. 
- unet, text_encoders = ds_model.get_models() # for compatiblility - text_encoder = text_encoders + + training_models = [] + unet = ds_model.models["unet"] + training_models.append(unet) + if train_text_encoder: + text_encoder = ds_model.models["text_encoder"] + training_models.append(text_encoder) + else: if train_text_encoder: unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( diff --git a/train_network.py b/train_network.py index cc445d39a..dfa17eb32 100644 --- a/train_network.py +++ b/train_network.py @@ -410,26 +410,22 @@ def train(self, args): # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: - # wrapping model - import deepspeed - if args.offload_optimizer_device is not None: - accelerator.print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - accelerator.print('[DeepSpeed] building cpu_adam done.') - class DeepSpeedModel(torch.nn.Module): - def __init__(self, unet, text_encoder, network) -> None: - super().__init__() - self.unet = unet - self.text_encoders = self.text_encoder = torch.nn.ModuleList(text_encoder) - self.network = network - - def get_models(self): - return self.unet, self.text_encoders, self.network - ds_model = DeepSpeedModel(unet, text_encoders, network) + training_models_dict = {} + if train_unet: training_models_dict["unet"] = unet + if train_text_encoder: training_models_dict["text_encoder"] = text_encoders + training_models_dict["network"] = network + + ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - # Now, ds_model is an instance of DeepSpeedEngine. - unet, text_encoders, network = ds_model.get_models() # for compatiblility - text_encoder = text_encoders + + if train_unet: unet = ds_model.models["unet"] + if train_text_encoder: + text_encoder = ds_model.models["text_encoder"] + if len(ds_model.models["text_encoder"]) > 1: + text_encoders = text_encoder + else: + text_encoders = [text_encoder] + else: if train_unet: unet = accelerator.prepare(unet) From e3ccf8fbf73a0f728fc167a20b1e0648a3604f41 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Tue, 27 Feb 2024 21:30:46 +0900 Subject: [PATCH 12/14] make deepspeed_utils --- fine_tune.py | 35 +++++----- library/deepspeed_utils.py | 139 +++++++++++++++++++++++++++++++++++++ library/train_util.py | 110 ++--------------------------- sdxl_train.py | 64 ++++++++--------- train_db.py | 39 ++++++----- train_network.py | 51 +++++++------- 6 files changed, 238 insertions(+), 200 deletions(-) create mode 100644 library/deepspeed_utils.py diff --git a/fine_tune.py b/fine_tune.py index c5e97d267..b018a933d 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -10,7 +10,9 @@ from tqdm import tqdm import torch +from library import deepspeed_utils from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed @@ -42,6 +44,7 @@ def train(args): train_util.verify_training_args(args) train_util.prepare_dataset_args(args, True) + deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) cache_latents = args.cache_latents @@ -219,7 +222,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. 
+ num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) @@ -231,7 +234,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): accelerator.print( f"override steps. steps for {args.max_train_epochs} epochs is / 指定エポックまでのステップ数: {args.max_train_steps}" ) - + # データセット側にも学習ステップを送信 train_dataset_group.set_max_train_steps(args.max_train_steps) @@ -248,21 +251,16 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder.to(weight_dtype) if args.deepspeed: - training_models_dict = {} - training_models_dict["unet"] = unet - if args.train_text_encoder: training_models_dict["text_encoder"] = text_encoder - - ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) - ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - - training_models = [] - unet = ds_model.models["unet"] - training_models.append(unet) if args.train_text_encoder: - text_encoder = ds_model.models["text_encoder"] - training_models.append(text_encoder) - - else: # acceleratorがなんかよろしくやってくれるらしい + ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder) + else: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_models = [ds_model] + else: + # acceleratorがなんかよろしくやってくれるらしい if args.train_text_encoder: unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( unet, text_encoder, optimizer, train_dataloader, lr_scheduler @@ -327,13 +325,13 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): for step, batch in enumerate(train_dataloader): current_step.value = global_step - with accelerator.accumulate(training_models[0]): # 複数モデルに対応していない模様だがとりあえずこうしておく + with accelerator.accumulate(*training_models): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: latents = batch["latents"].to(accelerator.device) # .to(dtype=weight_dtype) else: # latentに変換 - latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() + latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(weight_dtype) latents = latents * 0.18215 b_size = latents.shape[0] @@ -493,6 +491,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) diff --git a/library/deepspeed_utils.py b/library/deepspeed_utils.py new file mode 100644 index 000000000..99a7b2b3b --- /dev/null +++ b/library/deepspeed_utils.py @@ -0,0 +1,139 @@ +import os +import argparse +import torch +from accelerate import DeepSpeedPlugin, Accelerator + +from .utils import setup_logging + +setup_logging() +import logging + +logger = logging.getLogger(__name__) + + +def add_deepspeed_arguments(parser: argparse.ArgumentParser): + # DeepSpeed Arguments. 
https://huggingface.co/docs/accelerate/usage_guides/deepspeed + parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training") + parser.add_argument("--zero_stage", type=int, default=2, choices=[0, 1, 2, 3], help="Possible options are 0,1,2,3.") + parser.add_argument( + "--offload_optimizer_device", + type=str, + default=None, + choices=[None, "cpu", "nvme"], + help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3.", + ) + parser.add_argument( + "--offload_optimizer_nvme_path", + type=str, + default=None, + help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.", + ) + parser.add_argument( + "--offload_param_device", + type=str, + default=None, + choices=[None, "cpu", "nvme"], + help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3.", + ) + parser.add_argument( + "--offload_param_nvme_path", + type=str, + default=None, + help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3.", + ) + parser.add_argument( + "--zero3_init_flag", + action="store_true", + help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models." + "Only applicable with ZeRO Stage-3.", + ) + parser.add_argument( + "--zero3_save_16bit_model", + action="store_true", + help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3.", + ) + parser.add_argument( + "--fp16_master_weights_and_gradients", + action="store_true", + help="fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32.", + ) + + +def prepare_deepspeed_args(args: argparse.Namespace): + if not args.deepspeed: + return + + # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. + args.max_data_loader_n_workers = 1 + + +def prepare_deepspeed_plugin(args: argparse.Namespace): + if not args.deepspeed: + return None + + try: + import deepspeed + except ImportError as e: + logger.error( + "deepspeed is not installed. please install deepspeed in your environment with following command. DS_BUILD_OPS=0 pip install deepspeed" + ) + exit(1) + + deepspeed_plugin = DeepSpeedPlugin( + zero_stage=args.zero_stage, + gradient_accumulation_steps=args.gradient_accumulation_steps, + gradient_clipping=args.max_grad_norm, + offload_optimizer_device=args.offload_optimizer_device, + offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, + offload_param_device=args.offload_param_device, + offload_param_nvme_path=args.offload_param_nvme_path, + zero3_init_flag=args.zero3_init_flag, + zero3_save_16bit_model=args.zero3_save_16bit_model, + ) + deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size + deepspeed_plugin.deepspeed_config["train_batch_size"] = ( + args.train_batch_size * args.gradient_accumulation_steps * int(os.environ["WORLD_SIZE"]) + ) + deepspeed_plugin.set_mixed_precision(args.mixed_precision) + if args.mixed_precision.lower() == "fp16": + deepspeed_plugin.deepspeed_config["fp16"]["initial_scale_power"] = 0 # preventing overflow. 
+ if args.full_fp16 or args.fp16_master_weights_and_gradients: + if args.offload_optimizer_device == "cpu" and args.zero_stage == 2: + deepspeed_plugin.deepspeed_config["fp16"]["fp16_master_weights_and_grads"] = True + logger.info("[DeepSpeed] full fp16 enable.") + else: + logger.info( + "[DeepSpeed]full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam on ZeRO-2 stage." + ) + + if args.offload_optimizer_device is not None: + logger.info("[DeepSpeed] start to manually build cpu_adam.") + deepspeed.ops.op_builder.CPUAdamBuilder().load() + logger.info("[DeepSpeed] building cpu_adam done.") + + return deepspeed_plugin + + +# Accelerate library does not support multiple models for deepspeed. So, we need to wrap multiple models into a single model. +def prepare_deepspeed_model(args: argparse.Namespace, **models): + # remove None from models + models = {k: v for k, v in models.items() if v is not None} + + class DeepSpeedWrapper(torch.nn.Module): + def __init__(self, **kw_models) -> None: + super().__init__() + self.models = torch.nn.ModuleDict() + + for key, model in kw_models.items(): + if isinstance(model, list): + model = torch.nn.ModuleList(model) + assert isinstance( + model, torch.nn.Module + ), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}" + self.models.update(torch.nn.ModuleDict({key: model})) + + def get_models(self): + return self.models + + ds_model = DeepSpeedWrapper(**models) + return ds_model diff --git a/library/train_util.py b/library/train_util.py index 3781dcde8..38e1b458d 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -21,7 +21,6 @@ Union, ) from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs -from accelerate import DeepSpeedPlugin import glob import math import os @@ -70,6 +69,7 @@ import library.model_util as model_util import library.huggingface_util as huggingface_util import library.sai_model_spec as sai_model_spec +import library.deepspeed_utils as deepspeed_utils from library.utils import setup_logging setup_logging() @@ -3243,52 +3243,6 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: "--prior_loss_weight", type=float, default=1.0, help="loss weight for regularization images / 正則化画像のlossの重み" ) - # DeepSpeed Arguments. https://huggingface.co/docs/accelerate/usage_guides/deepspeed - parser.add_argument("--deepspeed", action="store_true", help="enable deepspeed training") - parser.add_argument( - "--zero_stage", - type=int, default=2, - choices=[0, 1, 2, 3], - help="Possible options are 0,1,2,3." - ) - parser.add_argument( - "--offload_optimizer_device", - type=str, default=None, - choices=[None, "cpu", "nvme"], - help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stages 2 and 3." - ) - parser.add_argument( - "--offload_optimizer_nvme_path", - type=str, default=None, - help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3." - ) - parser.add_argument( - "--offload_param_device", - type=str, default=None, - choices=[None, "cpu", "nvme"], - help="Possible options are none|cpu|nvme. Only applicable with ZeRO Stage 3." - ) - parser.add_argument( - "--offload_param_nvme_path", - type=str, default=None, - help="Possible options are /nvme|/local_nvme. Only applicable with ZeRO Stage 3." - ) - parser.add_argument( - "--zero3_init_flag", - action="store_true", - help="Flag to indicate whether to enable `deepspeed.zero.Init` for constructing massive models." 
- "Only applicable with ZeRO Stage-3." - ) - parser.add_argument( - "--zero3_save_16bit_model", - action="store_true", - help="Flag to indicate whether to save 16-bit model. Only applicable with ZeRO Stage-3." - ) - parser.add_argument( - "--fp16_master_weights_and_gradients", - action="store_true", - help="fp16_master_and_gradients requires optimizer to support keeping fp16 master and gradients while keeping the optimizer states in fp32." - ) def verify_training_args(args: argparse.Namespace): r""" @@ -4090,6 +4044,10 @@ def load_tokenizer(args: argparse.Namespace): def prepare_accelerator(args: argparse.Namespace): + """ + this function also prepares deepspeed plugin + """ + if args.logging_dir is None: logging_dir = None else: @@ -4135,7 +4093,7 @@ def prepare_accelerator(args: argparse.Namespace): ), ) kwargs_handlers = list(filter(lambda x: x is not None, kwargs_handlers)) - deepspeed_plugin = prepare_deepspeed_plugin(args) + deepspeed_plugin = deepspeed_utils.prepare_deepspeed_plugin(args) accelerator = Accelerator( gradient_accumulation_steps=args.gradient_accumulation_steps, @@ -4149,62 +4107,6 @@ def prepare_accelerator(args: argparse.Namespace): print("accelerator device:", accelerator.device) return accelerator -def prepare_deepspeed_plugin(args: argparse.Namespace): - if args.deepspeed is None: return None - try: - import deepspeed - except ImportError as e: - print("deepspeed is not installed. please install deepspeed in your environment with following command. DS_BUILD_OPS=0 pip install deepspeed") - exit(1) - - deepspeed_plugin = DeepSpeedPlugin( - zero_stage=args.zero_stage, - gradient_accumulation_steps=args.gradient_accumulation_steps, gradient_clipping=args.max_grad_norm, - offload_optimizer_device=args.offload_optimizer_device, offload_optimizer_nvme_path=args.offload_optimizer_nvme_path, - offload_param_device=args.offload_param_device, offload_param_nvme_path=args.offload_param_nvme_path, - zero3_init_flag=args.zero3_init_flag, zero3_save_16bit_model=args.zero3_save_16bit_model, - ) - deepspeed_plugin.deepspeed_config['train_micro_batch_size_per_gpu'] = args.train_batch_size - deepspeed_plugin.deepspeed_config['train_batch_size'] = \ - args.train_batch_size * args.gradient_accumulation_steps * int(os.environ['WORLD_SIZE']) - deepspeed_plugin.set_mixed_precision(args.mixed_precision) - if args.mixed_precision.lower() == "fp16": - deepspeed_plugin.deepspeed_config['fp16']['initial_scale_power'] = 0 # preventing overflow. 
- if args.full_fp16 or args.fp16_master_weights_and_gradients: - if args.offload_optimizer_device == "cpu" and args.zero_stage == 2: - deepspeed_plugin.deepspeed_config['fp16']['fp16_master_weights_and_grads'] = True - print("[DeepSpeed] full fp16 enable.") - else: - print("[DeepSpeed]full fp16, fp16_master_weights_and_grads currently only supported using ZeRO-Offload with DeepSpeedCPUAdam on ZeRO-2 stage.") - - if args.offload_optimizer_device is not None: - print('[DeepSpeed] start to manually build cpu_adam.') - deepspeed.ops.op_builder.CPUAdamBuilder().load() - print('[DeepSpeed] building cpu_adam done.') - - return deepspeed_plugin - -def prepare_deepspeed_model(args: argparse.Namespace, **models): - class DeepSpeedWrapper(torch.nn.Module): - def __init__(self, **kw_models) -> None: - super().__init__() - self.models = torch.nn.ModuleDict() - - for key, model in kw_models.items(): - if isinstance(model, list): - model = torch.nn.ModuleList(model) - assert isinstance(model, torch.nn.Module), f"model must be an instance of torch.nn.Module, but got {key} is {type(model)}" - self.models.update( - torch.nn.ModuleDict( - {key: model} - ) - ) - - def get_models(self): - return self.models - - ds_model = DeepSpeedWrapper(**models) - return ds_model def prepare_dtype(args: argparse.Namespace): weight_dtype = torch.float32 diff --git a/sdxl_train.py b/sdxl_train.py index 5e5e9f291..0feb4e367 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -11,11 +11,12 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed from diffusers import DDPMScheduler -from library import sdxl_model_util +from library import deepspeed_utils, sdxl_model_util import library.train_util as train_util @@ -97,6 +98,7 @@ def train(args): train_util.verify_training_args(args) train_util.prepare_dataset_args(args, True) sdxl_train_util.verify_sdxl_training_args(args) + deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) assert ( @@ -361,7 +363,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. 
+ num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) @@ -398,41 +400,31 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder1.to(weight_dtype) text_encoder2.to(weight_dtype) + # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer + if train_text_encoder1: + text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) + text_encoder1.text_model.final_layer_norm.requires_grad_(False) + if args.deepspeed: - training_models_dict = {} - if train_unet: - training_models_dict["unet"] = unet - if train_text_encoder1: - text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) - text_encoder1.text_model.final_layer_norm.requires_grad_(False) - training_models_dict["text_encoder1"] = text_encoder1 - if train_text_encoder2: - training_models_dict["text_encoder2"] = text_encoder2 - ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) - ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - - training_models = [] # override training_models - if train_unet: - unet = ds_model.models["unet"] - training_models.append(unet) - if train_text_encoder1: - text_encoder1 = ds_model.models["text_encoder1"] - training_models.append(text_encoder1) - if train_text_encoder2: - text_encoder2 = ds_model.models["text_encoder2"] - training_models.append(text_encoder2) + ds_model = deepspeed_utils.prepare_deepspeed_model( + args, + unet=unet if train_unet else None, + text_encoder1=text_encoder1 if train_text_encoder1 else None, + text_encoder2=text_encoder2 if train_text_encoder2 else None, + ) + ds_model = accelerator.prepare(ds_model) + training_models = [ds_model] - else: # acceleratorがなんかよろしくやってくれるらしい + else: + # acceleratorがなんかよろしくやってくれるらしい if train_unet: unet = accelerator.prepare(unet) if train_text_encoder1: - # freeze last layer and final_layer_norm in te1 since we use the output of the penultimate layer - text_encoder1.text_model.encoder.layers[-1].requires_grad_(False) - text_encoder1.text_model.final_layer_norm.requires_grad_(False) text_encoder1 = accelerator.prepare(text_encoder1) if train_text_encoder2: text_encoder2 = accelerator.prepare(text_encoder2) - optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: @@ -446,8 +438,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder2.to(accelerator.device) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16 and not args.deepspeed: + if args.full_fp16: # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. + # -> But we think it's ok to patch accelerator even if deepspeed is enabled. train_util.patch_accelerator_for_fp16_training(accelerator) # resumeする @@ -508,10 +501,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): for step, batch in enumerate(train_dataloader): current_step.value = global_step with accelerator.accumulate(*training_models): - with torch.no_grad(): # why this block differ within train_network.py? 
- if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) - else: + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) + else: + with torch.no_grad(): # latentに変換 latents = vae.encode(batch["images"].to(vae_dtype)).latent_dist.sample().to(weight_dtype) @@ -519,7 +512,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): if torch.any(torch.isnan(latents)): accelerator.print("NaN found in latents, replacing with zeros") latents = torch.nan_to_num(latents, 0, out=latents) - latents = latents * sdxl_model_util.VAE_SCALE_FACTOR + latents = latents * sdxl_model_util.VAE_SCALE_FACTOR if "text_encoder_outputs1_list" not in batch or batch["text_encoder_outputs1_list"] is None: input_ids1 = batch["input_ids"] @@ -768,6 +761,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, False) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) diff --git a/train_db.py b/train_db.py index 66a83d1df..ea1cfeb8a 100644 --- a/train_db.py +++ b/train_db.py @@ -11,7 +11,9 @@ from tqdm import tqdm import torch +from library import deepspeed_utils from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed @@ -46,6 +48,7 @@ def train(args): train_util.verify_training_args(args) train_util.prepare_dataset_args(args, False) + deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) cache_latents = args.cache_latents @@ -187,7 +190,7 @@ def train(args): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: DataLoader worker exited unexpectedly with exit code 1. 
+ num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) @@ -220,30 +223,27 @@ def train(args): # acceleratorがなんかよろしくやってくれるらしい if args.deepspeed: - training_models_dict = {} - training_models_dict["unet"] = unet - if train_text_encoder: training_models_dict["text_encoder"] = text_encoder - - ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) - ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - - training_models = [] - unet = ds_model.models["unet"] - training_models.append(unet) - if train_text_encoder: - text_encoder = ds_model.models["text_encoder"] - training_models.append(text_encoder) - + if args.train_text_encoder: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet, text_encoder=text_encoder) + else: + ds_model = deepspeed_utils.prepare_deepspeed_model(args, unet=unet) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_models = [ds_model] + else: if train_text_encoder: unet, text_encoder, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( unet, text_encoder, optimizer, train_dataloader, lr_scheduler ) + training_models = [unet, text_encoder] else: unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + training_models = [unet] - if not train_text_encoder: - text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error + if not train_text_encoder: + text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする if args.full_fp16: @@ -312,8 +312,10 @@ def train(args): if not args.gradient_checkpointing: text_encoder.train(False) text_encoder.requires_grad_(False) + if len(training_models) == 2: + training_models = training_models[0] # remove text_encoder from training_models - with accelerator.accumulate(unet): + with accelerator.accumulate(*training_models): with torch.no_grad(): # latentに変換 if cache_latents: @@ -480,6 +482,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, False, True) train_util.add_training_arguments(parser, True) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_sd_saving_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) diff --git a/train_network.py b/train_network.py index af1b7f635..a6ce169a9 100644 --- a/train_network.py +++ b/train_network.py @@ -13,13 +13,14 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from torch.nn.parallel import DistributedDataParallel as DDP from accelerate.utils import set_seed from diffusers import DDPMScheduler -from library import model_util +from library import deepspeed_utils, model_util import library.train_util as train_util from library.train_util import ( @@ -141,6 +142,7 @@ def train(self, args): training_started_at = time.time() train_util.verify_training_args(args) train_util.prepare_dataset_args(args, True) + deepspeed_utils.prepare_deepspeed_args(args) setup_logging(args, reset=True) cache_latents = args.cache_latents @@ -357,7 +359,7 @@ def train(self, args): batch_size=1, shuffle=True, collate_fn=collator, - num_workers=n_workers if not args.deepspeed else 1, # To avoid RuntimeError: 
DataLoader worker exited unexpectedly with exit code 1. + num_workers=n_workers, persistent_workers=args.persistent_data_loader_workers, ) @@ -414,22 +416,17 @@ def train(self, args): # acceleratorがなんかよろしくやってくれるらしい / accelerator will do something good if args.deepspeed: - training_models_dict = {} - if train_unet: training_models_dict["unet"] = unet - if train_text_encoder: training_models_dict["text_encoder"] = text_encoders - training_models_dict["network"] = network - - ds_model = train_util.prepare_deepspeed_model(args, **training_models_dict) - ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(ds_model, optimizer, train_dataloader, lr_scheduler) - - if train_unet: unet = ds_model.models["unet"] - if train_text_encoder: - text_encoder = ds_model.models["text_encoder"] - if len(ds_model.models["text_encoder"]) > 1: - text_encoders = text_encoder - else: - text_encoders = [text_encoder] - + ds_model = deepspeed_utils.prepare_deepspeed_model( + args, + unet=unet if train_unet else None, + text_encoder1=text_encoders[0] if train_text_encoder else None, + text_encoder2=text_encoders[1] if train_text_encoder and len(text_encoders) > 1 else None, + network=network, + ) + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) + training_model = ds_model else: if train_unet: unet = accelerator.prepare(unet) @@ -444,7 +441,10 @@ def train(self, args): else: pass # if text_encoder is not trained, no need to prepare. and device and dtype are already set - network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler) + network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + network, optimizer, train_dataloader, lr_scheduler + ) + training_model = network if args.gradient_checkpointing: # according to TI example in Diffusers, train is required @@ -777,13 +777,13 @@ def remove_model(old_ckpt_name): for step, batch in enumerate(train_dataloader): current_step.value = global_step - with accelerator.accumulate(network): + with accelerator.accumulate(training_model): on_step_start(text_encoder, unet) - with torch.no_grad(): - if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) - else: + if "latents" in batch and batch["latents"] is not None: + latents = batch["latents"].to(accelerator.device) + else: + with torch.no_grad(): # latentに変換 latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample() @@ -791,7 +791,7 @@ def remove_model(old_ckpt_name): if torch.any(torch.isnan(latents)): accelerator.print("NaN found in latents, replacing with zeros") latents = torch.nan_to_num(latents, 0, out=latents) - latents = latents * self.vae_scale_factor + latents = latents * self.vae_scale_factor # get multiplier for each sample if network_has_multiplier: @@ -976,6 +976,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, True) train_util.add_training_arguments(parser, True) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) From d9456020d7547743c809a7c93f9a487276a66c74 Mon Sep 17 00:00:00 2001 From: BootsofLagrangian Date: Wed, 20 Mar 2024 20:52:59 +0900 Subject: [PATCH 13/14] Fix most of ZeRO stage uses optimizer partitioning - we 
have to prepare optimizer and ds_model at the same time. - pull/1139#issuecomment-1986790007 Signed-off-by: BootsofLagrangian --- sdxl_train.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sdxl_train.py b/sdxl_train.py index 613fe30b3..2cb80b6b3 100644 --- a/sdxl_train.py +++ b/sdxl_train.py @@ -412,7 +412,10 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder1=text_encoder1 if train_text_encoder1 else None, text_encoder2=text_encoder2 if train_text_encoder2 else None, ) - ds_model = accelerator.prepare(ds_model) + # most of ZeRO stage uses optimizer partitioning, so we have to prepare optimizer and ds_model at the same time. # pull/1139#issuecomment-1986790007 + ds_model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + ds_model, optimizer, train_dataloader, lr_scheduler + ) training_models = [ds_model] else: @@ -423,8 +426,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): text_encoder1 = accelerator.prepare(text_encoder1) if train_text_encoder2: text_encoder2 = accelerator.prepare(text_encoder2) - - optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) + optimizer, train_dataloader, lr_scheduler = accelerator.prepare(optimizer, train_dataloader, lr_scheduler) # TextEncoderの出力をキャッシュするときにはCPUへ移動する if args.cache_text_encoder_outputs: From a2b85316275e6a683fa70204ec192401776c2dc0 Mon Sep 17 00:00:00 2001 From: Kohya S Date: Mon, 25 Mar 2024 22:28:46 +0900 Subject: [PATCH 14/14] make each script consistent, fix to work w/o DeepSpeed --- fine_tune.py | 7 ++++++- library/train_util.py | 2 +- sdxl_train_control_net_lllite.py | 7 ++++--- sdxl_train_control_net_lllite_old.py | 7 ++++--- train_controlnet.py | 4 +++- train_db.py | 2 +- train_network.py | 7 +++---- train_textual_inversion.py | 8 +++++--- train_textual_inversion_XTI.py | 4 +++- 9 files changed, 30 insertions(+), 18 deletions(-) diff --git a/fine_tune.py b/fine_tune.py index e2f2c8a9a..a0350ce18 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -328,7 +328,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): with accelerator.accumulate(*training_models): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) # .to(dtype=weight_dtype) + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: # latentに変換 latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(weight_dtype) @@ -507,6 +507,11 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="learning rate for text encoder, default is same as unet / Text Encoderの学習率、デフォルトはunetと同じ", ) + parser.add_argument( + "--no_half_vae", + action="store_true", + help="do not use fp16/bf16 VAE in mixed precision (use float VAE) / mixed precisionでも fp16/bf16 VAEを使わずfloat VAEを使う", + ) return parser diff --git a/library/train_util.py b/library/train_util.py index e10e38daf..6ca5d559e 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -20,7 +20,7 @@ Tuple, Union, ) -from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs +from accelerate import Accelerator, InitProcessGroupKwargs, DistributedDataParallelKwargs, PartialState import glob import math import os diff --git a/sdxl_train_control_net_lllite.py b/sdxl_train_control_net_lllite.py index e99b4e35c..9eaaa19f2 100644 --- a/sdxl_train_control_net_lllite.py +++ b/sdxl_train_control_net_lllite.py @@ -22,7 +22,7 @@ import accelerate from 
diffusers import DDPMScheduler, ControlNetModel from safetensors.torch import load_file -from library import sai_model_spec, sdxl_model_util, sdxl_original_unet, sdxl_train_util +from library import deepspeed_utils, sai_model_spec, sdxl_model_util, sdxl_original_unet, sdxl_train_util import library.model_util as model_util import library.train_util as train_util @@ -394,10 +394,10 @@ def remove_model(old_ckpt_name): with accelerator.accumulate(unet): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: # latentに変換 - latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample() + latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(dtype=weight_dtype) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): @@ -566,6 +566,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) diff --git a/sdxl_train_control_net_lllite_old.py b/sdxl_train_control_net_lllite_old.py index dac56eedd..e55a58896 100644 --- a/sdxl_train_control_net_lllite_old.py +++ b/sdxl_train_control_net_lllite_old.py @@ -18,7 +18,7 @@ from accelerate.utils import set_seed from diffusers import DDPMScheduler, ControlNetModel from safetensors.torch import load_file -from library import sai_model_spec, sdxl_model_util, sdxl_original_unet, sdxl_train_util +from library import deepspeed_utils, sai_model_spec, sdxl_model_util, sdxl_original_unet, sdxl_train_util import library.model_util as model_util import library.train_util as train_util @@ -361,10 +361,10 @@ def remove_model(old_ckpt_name): with accelerator.accumulate(network): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: # latentに変換 - latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample() + latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(dtype=weight_dtype) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): @@ -534,6 +534,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) diff --git a/train_controlnet.py b/train_controlnet.py index e44f08853..0cb0405fd 100644 --- a/train_controlnet.py +++ b/train_controlnet.py @@ -11,6 +11,7 @@ from tqdm import tqdm import torch +from library import deepspeed_utils from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -396,7 +397,7 @@ def remove_model(old_ckpt_name): with accelerator.accumulate(controlnet): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) + latents = 
batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: # latentに変換 latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() @@ -584,6 +585,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, False, True, True) train_util.add_training_arguments(parser, False) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser) diff --git a/train_db.py b/train_db.py index e27bc7ab4..bc7ef4ed5 100644 --- a/train_db.py +++ b/train_db.py @@ -319,7 +319,7 @@ def train(args): with torch.no_grad(): # latentに変換 if cache_latents: - latents = batch["latents"].to(accelerator.device) + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() latents = latents * 0.18215 diff --git a/train_network.py b/train_network.py index a74a3caf1..f2451f412 100644 --- a/train_network.py +++ b/train_network.py @@ -471,8 +471,7 @@ def train(self, args): vae.to(accelerator.device, dtype=vae_dtype) # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする - if args.full_fp16 and not args.deepspeed: - # During deepseed training, accelerate not handles fp16/bf16|mixed precision directly via scaler. Let deepspeed engine do. + if args.full_fp16: train_util.patch_accelerator_for_fp16_training(accelerator) # resumeする @@ -781,11 +780,11 @@ def remove_model(old_ckpt_name): on_step_start(text_encoder, unet) if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: with torch.no_grad(): # latentに変換 - latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample() + latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(dtype=weight_dtype) # NaNが含まれていれば警告を表示し0に置き換える if torch.any(torch.isnan(latents)): diff --git a/train_textual_inversion.py b/train_textual_inversion.py index 0266bc143..72e26a450 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -8,12 +8,13 @@ import torch from library.device_utils import init_ipex, clean_memory_on_device + init_ipex() from accelerate.utils import set_seed from diffusers import DDPMScheduler from transformers import CLIPTokenizer -from library import model_util +from library import deepspeed_utils, model_util import library.train_util as train_util import library.huggingface_util as huggingface_util @@ -558,10 +559,10 @@ def remove_model(old_ckpt_name): with accelerator.accumulate(text_encoders[0]): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: # latentに変換 - latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample() + latents = vae.encode(batch["images"].to(dtype=vae_dtype)).latent_dist.sample().to(dtype=weight_dtype) latents = latents * self.vae_scale_factor # Get the text embedding for conditioning @@ -749,6 +750,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, False) train_util.add_training_arguments(parser, True) + deepspeed_utils.add_deepspeed_arguments(parser) 
train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser, False) diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index ad7c267eb..00b251ba9 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -8,6 +8,7 @@ from tqdm import tqdm import torch +from library import deepspeed_utils from library.device_utils import init_ipex, clean_memory_on_device init_ipex() @@ -439,7 +440,7 @@ def remove_model(old_ckpt_name): with accelerator.accumulate(text_encoder): with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: - latents = batch["latents"].to(accelerator.device) + latents = batch["latents"].to(accelerator.device).to(dtype=weight_dtype) else: # latentに変換 latents = vae.encode(batch["images"].to(dtype=weight_dtype)).latent_dist.sample() @@ -662,6 +663,7 @@ def setup_parser() -> argparse.ArgumentParser: train_util.add_sd_models_arguments(parser) train_util.add_dataset_arguments(parser, True, True, False) train_util.add_training_arguments(parser, True) + deepspeed_utils.add_deepspeed_arguments(parser) train_util.add_optimizer_arguments(parser) config_util.add_config_arguments(parser) custom_train_functions.add_custom_train_arguments(parser, False)
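
Note (illustrative sketch, not part of the patches above): the key design in library/deepspeed_utils.py is that Accelerate's DeepSpeed integration only handles a single model, so the scripts wrap the U-Net, the text encoder(s) and, for train_network.py, the network into one torch.nn.Module and pass that wrapper to accelerator.prepare together with the optimizer, dataloader and lr_scheduler (ZeRO partitions optimizer state, so they must be prepared in the same call, as PATCH 13 notes). The following self-contained sketch uses placeholder nn.Linear sub-modules; only the wrapping pattern itself is taken from the patch, and the real helper additionally drops entries passed as None.

# Minimal sketch of the multi-model wrapping pattern used by prepare_deepspeed_model.
# The sub-models here are placeholders (nn.Linear); in the real scripts they are the
# U-Net, the CLIP text encoder(s) and, in train_network.py, the LoRA network.
import torch


class DeepSpeedWrapper(torch.nn.Module):
    def __init__(self, **kw_models) -> None:
        super().__init__()
        self.models = torch.nn.ModuleDict()
        for key, model in kw_models.items():
            if isinstance(model, list):  # e.g. a list of text encoders
                model = torch.nn.ModuleList(model)
            assert isinstance(model, torch.nn.Module), f"{key} is {type(model)}"
            self.models[key] = model

    def get_models(self):
        return self.models


if __name__ == "__main__":
    unet = torch.nn.Linear(4, 4)             # placeholder for the U-Net
    text_encoders = [torch.nn.Linear(4, 4)]  # placeholder for the text encoder(s)

    ds_model = DeepSpeedWrapper(unet=unet, text_encoder=text_encoders)

    # In the training scripts this single wrapper is passed to accelerator.prepare()
    # together with optimizer, train_dataloader and lr_scheduler, and the wrapped
    # sub-modules stay reachable through ds_model.models / get_models().
    print(list(ds_model.get_models().keys()))  # ['unet', 'text_encoder']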
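
Note (illustrative): prepare_deepspeed_plugin copies the batch-size bookkeeping into the DeepSpeed config: train_micro_batch_size_per_gpu is the per-device --train_batch_size, and train_batch_size is that value multiplied by --gradient_accumulation_steps and the number of processes (WORLD_SIZE). A tiny sketch of that arithmetic with arbitrary example values:

# Sketch of the batch-size values prepare_deepspeed_plugin writes into deepspeed_config.
# The numbers in the example call are arbitrary and for illustration only.
def deepspeed_batch_sizes(train_batch_size: int, gradient_accumulation_steps: int, world_size: int):
    micro = train_batch_size  # -> deepspeed_config["train_micro_batch_size_per_gpu"]
    total = train_batch_size * gradient_accumulation_steps * world_size  # -> deepspeed_config["train_batch_size"]
    return micro, total


if __name__ == "__main__":
    print(deepspeed_batch_sizes(train_batch_size=4, gradient_accumulation_steps=8, world_size=2))  # (4, 64)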