diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py
index eede54aafc2c..293b8a3f5bce 100644
--- a/nemo/collections/nlp/parts/nlp_overrides.py
+++ b/nemo/collections/nlp/parts/nlp_overrides.py
@@ -426,7 +426,7 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None:
             and self.lightning_module.sharded_state_dict() is not None
         ):
             if self.is_global_zero:
-                shutil.rmtree(ckpt_to_dir(filepath))
+                shutil.rmtree(ckpt_to_dir(filepath), ignore_errors=True)
 
         # legacy checkpoint logic, does not use megatron core
         else:
diff --git a/nemo/utils/callbacks/nemo_model_checkpoint.py b/nemo/utils/callbacks/nemo_model_checkpoint.py
index 87fe1d0a2e06..a290152907db 100644
--- a/nemo/utils/callbacks/nemo_model_checkpoint.py
+++ b/nemo/utils/callbacks/nemo_model_checkpoint.py
@@ -227,7 +227,7 @@ def _del_model_without_trainer(self, filepath: str) -> None:
         if is_global_rank_zero():
             try:
                 dist_ckpt = ckpt_to_dir(filepath)
-                shutil.rmtree(dist_ckpt)
+                shutil.rmtree(dist_ckpt, ignore_errors=True)
                 logging.info(f"Removed distributed checkpoint: {dist_ckpt}")
             except:
                 logging.info(f"Tried to remove distributed checkpoint: {dist_ckpt} but failed.")
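
Both hunks add ignore_errors=True to the shutil.rmtree calls that delete distributed-checkpoint directories, so removal of a directory that is already gone (or partially gone) no longer raises on the global-zero rank. Below is a minimal standalone sketch of the standard-library behavior this change relies on; the temporary path used here is purely illustrative and not part of the diff.

import shutil
import tempfile
from pathlib import Path

# A directory path that does not exist, standing in for an
# already-removed checkpoint directory.
missing = Path(tempfile.mkdtemp()) / "does_not_exist"

try:
    # Default behavior: rmtree raises FileNotFoundError (an OSError)
    # when the target directory is missing.
    shutil.rmtree(missing)
except FileNotFoundError:
    print(f"rmtree raised for missing path: {missing}")

# With ignore_errors=True the same call returns silently, which is
# what makes the checkpoint-removal paths above tolerant of repeats.
shutil.rmtree(missing, ignore_errors=True)
print("rmtree with ignore_errors=True returned without raising")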