From e2612c1144dd017b335c770131049c4ba9e77e04 Mon Sep 17 00:00:00 2001
From: Gerald Shen <119401249+gshennvm@users.noreply.github.com>
Date: Fri, 5 Jan 2024 15:49:07 -0800
Subject: [PATCH] add war fix for sync issues (#8130)

Signed-off-by: Gerald Shen
Signed-off-by: Sasha Meister
---
 nemo/collections/nlp/parts/nlp_overrides.py   | 2 +-
 nemo/utils/callbacks/nemo_model_checkpoint.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py
index eede54aafc2c..293b8a3f5bce 100644
--- a/nemo/collections/nlp/parts/nlp_overrides.py
+++ b/nemo/collections/nlp/parts/nlp_overrides.py
@@ -426,7 +426,7 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None:
             and self.lightning_module.sharded_state_dict() is not None
         ):
             if self.is_global_zero:
-                shutil.rmtree(ckpt_to_dir(filepath))
+                shutil.rmtree(ckpt_to_dir(filepath), ignore_errors=True)

         # legacy checkpoint logic, does not use megatron core
         else:
diff --git a/nemo/utils/callbacks/nemo_model_checkpoint.py b/nemo/utils/callbacks/nemo_model_checkpoint.py
index 87fe1d0a2e06..a290152907db 100644
--- a/nemo/utils/callbacks/nemo_model_checkpoint.py
+++ b/nemo/utils/callbacks/nemo_model_checkpoint.py
@@ -227,7 +227,7 @@ def _del_model_without_trainer(self, filepath: str) -> None:
         if is_global_rank_zero():
             try:
                 dist_ckpt = ckpt_to_dir(filepath)
-                shutil.rmtree(dist_ckpt)
+                shutil.rmtree(dist_ckpt, ignore_errors=True)
                 logging.info(f"Removed distributed checkpoint: {dist_ckpt}")
             except:
                 logging.info(f"Tried to remove distributed checkpoint: {dist_ckpt} but failed.")
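
Note (not part of the patch): a minimal sketch of why passing ignore_errors=True to shutil.rmtree works around the sync issue when more than one process may race to clean up the same distributed-checkpoint directory. The ckpt_to_dir helper here is a hypothetical stand-in for NeMo's utility of the same name, and the path in the usage example is made up.

    import shutil
    from pathlib import Path

    def ckpt_to_dir(filepath: str) -> Path:
        # Hypothetical stand-in: a distributed checkpoint is stored as a
        # directory named after the .ckpt path with the extension stripped.
        return Path(filepath).with_suffix('')

    def remove_dist_checkpoint(filepath: str) -> None:
        ckpt_dir = ckpt_to_dir(filepath)
        # Without ignore_errors=True, rmtree raises (e.g. FileNotFoundError)
        # if the directory is already gone or partially deleted by another
        # rank or an earlier cleanup pass. With ignore_errors=True the cleanup
        # becomes best-effort, which is the workaround applied at both call
        # sites in the patch above.
        shutil.rmtree(ckpt_dir, ignore_errors=True)

    if __name__ == '__main__':
        # Safe even when the checkpoint directory does not exist.
        remove_dist_checkpoint('/tmp/nonexistent/megatron_gpt-step=100.ckpt')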