From ef4c60a573705e36fb3d97571d1568bbc6648ca1 Mon Sep 17 00:00:00 2001
From: Tong Zhu
Date: Wed, 16 Aug 2023 02:23:19 +0000
Subject: [PATCH] update fsdp config and scheduled trainer

---
 conf/fsdp_config.json         | 3 +++
 scripts/train_backward_Myx.sh | 4 ++--
 scripts/train_seed.sh         | 4 ++--
 src/train.py                  | 2 +-
 4 files changed, 8 insertions(+), 5 deletions(-)
 create mode 100644 conf/fsdp_config.json

diff --git a/conf/fsdp_config.json b/conf/fsdp_config.json
new file mode 100644
index 0000000..37baa74
--- /dev/null
+++ b/conf/fsdp_config.json
@@ -0,0 +1,3 @@
+{
+    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer"
+}
\ No newline at end of file
diff --git a/scripts/train_backward_Myx.sh b/scripts/train_backward_Myx.sh
index c579678..1cc97a8 100644
--- a/scripts/train_backward_Myx.sh
+++ b/scripts/train_backward_Myx.sh
@@ -4,7 +4,7 @@
 num_nodes=1
 num_gpu_per_node=8
 bsz=32
-output_dir="outputs/backward"
+output_dir="/dev/shm/tzhu/Humback/outputs/backward_model_on_seed_data_scheduled"
 bsz_per_dev=$(echo "${bsz} / ${num_nodes} / ${num_gpu_per_node}" | bc)
 
 torchrun \
@@ -28,7 +28,7 @@ torchrun \
     --logging_strategy steps \
     --logging_steps 1 \
     --save_strategy epoch \
-    --save_total_limit 3 \
+    --save_total_limit 1 \
     --output_dir ${output_dir} \
     --overwrite_output_dir \
     --ddp_timeout 30000 \
diff --git a/scripts/train_seed.sh b/scripts/train_seed.sh
index 387149f..8251e4e 100644
--- a/scripts/train_seed.sh
+++ b/scripts/train_seed.sh
@@ -4,7 +4,7 @@
 num_nodes=1
 num_gpu_per_node=8
 bsz=32
-output_dir="outputs/seed_model"
+output_dir="/dev/shm/tzhu/Humback/outputs/forward_model_on_seed_data_scheduled"
 bsz_per_dev=$(echo "${bsz} / ${num_nodes} / ${num_gpu_per_node}" | bc)
 
 torchrun \
@@ -27,7 +27,7 @@ torchrun \
     --logging_strategy steps \
     --logging_steps 1 \
     --save_strategy epoch \
-    --save_total_limit 3 \
+    --save_total_limit 1 \
     --output_dir ${output_dir} \
     --overwrite_output_dir \
     --ddp_timeout 30000 \
diff --git a/src/train.py b/src/train.py
index c33bf5f..4abf640 100644
--- a/src/train.py
+++ b/src/train.py
@@ -369,7 +369,7 @@ def train():
     data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
 
     # Start trainner
-    trainer = Trainer(
+    trainer = ScheduledTrainer(
         model=model, tokenizer=tokenizer, args=training_args, **data_module
     )
     if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
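
Note (not part of the patch): the new conf/fsdp_config.json feeds the
fsdp_transformer_layer_cls_to_wrap key to the Hugging Face Trainer's FSDP
integration, so each LlamaDecoderLayer becomes its own FSDP wrap unit;
presumably the training scripts point at it via the --fsdp / --fsdp_config
arguments, although those flags are not visible in the hunks above. The other
substantive change swaps Trainer for ScheduledTrainer in src/train.py. That
class is defined elsewhere in the repository and is not shown in this diff; the
sketch below is only an illustration of a plausible shape, assuming it is a
transformers.Trainer subclass that installs a custom learning-rate schedule.
The class body and the particular schedule are assumptions, not the
repository's actual code.

    # Illustrative sketch only -- NOT the repository's ScheduledTrainer.
    # Assumption: a Trainer subclass that overrides create_scheduler() to use
    # its own warmup + cosine-decay schedule instead of the default one.
    import math

    from torch.optim.lr_scheduler import LambdaLR
    from transformers import Trainer


    class ScheduledTrainer(Trainer):
        def create_scheduler(self, num_training_steps: int, optimizer=None):
            optimizer = optimizer if optimizer is not None else self.optimizer
            warmup_steps = self.args.get_warmup_steps(num_training_steps)

            def lr_lambda(step: int) -> float:
                if step < warmup_steps:
                    # linear warmup from 0 up to the base learning rate
                    return step / max(1, warmup_steps)
                progress = (step - warmup_steps) / max(
                    1, num_training_steps - warmup_steps
                )
                # cosine decay from the base learning rate down to 0
                return 0.5 * (1.0 + math.cos(math.pi * progress))

            self.lr_scheduler = LambdaLR(optimizer, lr_lambda)
            return self.lr_scheduler

Because the constructor is inherited unchanged, the patched call site still
works as a drop-in replacement: trainer = ScheduledTrainer(model=model,
tokenizer=tokenizer, args=training_args, **data_module).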