diff --git a/examples/sft/ds_config_zero3.json b/examples/sft/ds_config_zero3.json
index e30fe94e..e588ea63 100644
--- a/examples/sft/ds_config_zero3.json
+++ b/examples/sft/ds_config_zero3.json
@@ -43,7 +43,7 @@
         "contiguous_gradients": true,
         "sub_group_size": 1e9,
         "reduce_bucket_size": "auto",
-        "stage3_prefetch_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "0",
         "stage3_param_persistence_threshold": "auto",
         "stage3_max_live_parameters": 1e9,
         "stage3_max_reuse_distance": 1e9,
diff --git a/examples/sft/finetune.py b/examples/sft/finetune.py
index 5b59e4b2..b896a4bc 100644
--- a/examples/sft/finetune.py
+++ b/examples/sft/finetune.py
@@ -18,6 +18,8 @@
 from transformers.trainer_pt_utils import LabelSmoother
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from accelerate.utils import DistributedType
+from deepspeed.utils import set_z3_leaf_modules
+from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
 
 IGNORE_TOKEN_ID = LabelSmoother.ignore_index
 
@@ -317,6 +319,11 @@ def train():
         else None,
         **model_load_kwargs,
     )
+
+    # Set z3 flag to make sparse MoE layer compatible with Zero3,
+    # following https://github.com/microsoft/DeepSpeed/pull/5008
+    set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock])
+
     tokenizer = AutoTokenizer.from_pretrained(
         model_args.model_name_or_path,
         cache_dir=training_args.cache_dir,
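
Note (not part of the patch): set_z3_leaf_modules operates on the instantiated model object, which is why the diff places the call directly after AutoModelForCausalLM.from_pretrained and before the Trainer initializes the DeepSpeed engine. Below is a minimal standalone sketch of the same registration; the checkpoint name and dtype are placeholders chosen for illustration, not values taken from this patch.

import torch
from transformers import AutoModelForCausalLM
from transformers.models.qwen2_moe.modeling_qwen2_moe import Qwen2MoeSparseMoeBlock
from deepspeed.utils import set_z3_leaf_modules

# Placeholder checkpoint and dtype (assumptions, not from the patch).
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-MoE-A2.7B",
    torch_dtype=torch.bfloat16,
)

# Register the sparse MoE block as a ZeRO-3 "leaf" module so DeepSpeed gathers
# its parameters as a single unit rather than hooking each expert submodule
# (see https://github.com/microsoft/DeepSpeed/pull/5008).
set_z3_leaf_modules(model, [Qwen2MoeSparseMoeBlock])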