add war fix for sync issues (NVIDIA#8130)
Signed-off-by: Gerald Shen <geshen@nvidia.com>
Signed-off-by: Sasha Meister <ameister@nvidia.com>
gshennvm authored and sashameister committed Feb 15, 2024
1 parent 67084a4 commit e2612c1
Showing 2 changed files with 2 additions and 2 deletions.
2 changes: 1 addition & 1 deletion nemo/collections/nlp/parts/nlp_overrides.py
@@ -426,7 +426,7 @@ def remove_checkpoint(self, filepath: Union[str, Path]) -> None:
and self.lightning_module.sharded_state_dict() is not None
):
if self.is_global_zero:
-                shutil.rmtree(ckpt_to_dir(filepath))
+                shutil.rmtree(ckpt_to_dir(filepath), ignore_errors=True)

# legacy checkpoint logic, does not use megatron core
else:
2 changes: 1 addition & 1 deletion nemo/utils/callbacks/nemo_model_checkpoint.py
@@ -227,7 +227,7 @@ def _del_model_without_trainer(self, filepath: str) -> None:
if is_global_rank_zero():
try:
dist_ckpt = ckpt_to_dir(filepath)
-                shutil.rmtree(dist_ckpt)
+                shutil.rmtree(dist_ckpt, ignore_errors=True)
logging.info(f"Removed distributed checkpoint: {dist_ckpt}")
except:
logging.info(f"Tried to remove distributed checkpoint: {dist_ckpt} but failed.")
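Both hunks apply the same workaround: when the distributed checkpoint directory is removed, shutil.rmtree is called with ignore_errors=True so that files that disappear concurrently (for example, another rank or a delayed filesystem sync removing them first) no longer raise and abort the run. Below is a minimal, self-contained sketch of that pattern; the function name, the is_global_zero argument, and the directory derivation are hypothetical stand-ins, whereas the real code uses ckpt_to_dir() and the strategy's rank check.

    import shutil
    from pathlib import Path

    def remove_distributed_checkpoint(filepath: str, is_global_zero: bool) -> None:
        """Delete the checkpoint directory derived from `filepath`, on rank zero only.

        Sketch of the workaround: ignore_errors=True makes rmtree tolerate files
        that vanish mid-removal instead of raising and crashing training.
        """
        # Hypothetical stand-in for NeMo's ckpt_to_dir(filepath).
        ckpt_dir = Path(filepath).with_suffix("")
        if is_global_zero:
            shutil.rmtree(ckpt_dir, ignore_errors=True)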
