From 7d3242976e530289ef1496d7e5e69ecac90329b0 Mon Sep 17 00:00:00 2001 From: Somshubra Majumdar Date: Tue, 27 Jun 2023 20:01:17 -0700 Subject: [PATCH] Add missing save restore connector to eval scripts Signed-off-by: smajumdar --- examples/nlp/language_modeling/megatron_gpt_eval.py | 8 +++++++- examples/nlp/language_modeling/megatron_t5_eval.py | 10 +++++++++- .../language_modeling/tuning/megatron_gpt_ia3_eval.py | 2 +- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/examples/nlp/language_modeling/megatron_gpt_eval.py b/examples/nlp/language_modeling/megatron_gpt_eval.py index af1657b44d7b..d8bc03bba4f2 100644 --- a/examples/nlp/language_modeling/megatron_gpt_eval.py +++ b/examples/nlp/language_modeling/megatron_gpt_eval.py @@ -173,8 +173,14 @@ def main(cfg) -> None: or cfg.pipeline_model_parallel_size < 0 or cfg.get('pipeline_model_parallel_split_rank', -1) < 0 ): + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(cfg.gpt_model_file): + save_restore_connector.model_extracted_dir = cfg.gpt_model_file model_config = MegatronGPTModel.restore_from( - restore_path=cfg.gpt_model_file, trainer=trainer, return_config=True, + restore_path=cfg.gpt_model_file, + trainer=trainer, + return_config=True, + save_restore_connector=save_restore_connector, ) with open_dict(cfg): diff --git a/examples/nlp/language_modeling/megatron_t5_eval.py b/examples/nlp/language_modeling/megatron_t5_eval.py index 0282f9fb2913..0b6ea54b6b99 100644 --- a/examples/nlp/language_modeling/megatron_t5_eval.py +++ b/examples/nlp/language_modeling/megatron_t5_eval.py @@ -13,6 +13,7 @@ # limitations under the License. +import os from argparse import ArgumentParser import torch @@ -61,8 +62,15 @@ def main(): or args.pipeline_model_parallel_size < 0 or args.pipeline_model_parallel_split_rank < 0 ): + save_restore_connector = NLPSaveRestoreConnector() + if os.path.isdir(args.model_file): + save_restore_connector.model_extracted_dir = args.model_file + model_config = MegatronT5Model.restore_from( - restore_path=args.model_file, trainer=Trainer(strategy=NLPDDPStrategy()), return_config=True, + restore_path=args.model_file, + trainer=Trainer(strategy=NLPDDPStrategy()), + return_config=True, + save_restore_connector=save_restore_connector, ) args.tensor_model_parallel_size = model_config.get('tensor_model_parallel_size', 1) diff --git a/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py b/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py index a676fee00a7e..a30818f29fb3 100644 --- a/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py +++ b/examples/nlp/language_modeling/tuning/megatron_gpt_ia3_eval.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - +import os import torch import torch.multiprocessing as mp from megatron.core import parallel_state