From 2f26d1c0785dd4517fc9ff491d9ce423edba733b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:24:16 -0700 Subject: [PATCH] fix a minor bug with async checkpointing where a checkpoint would get saved on_train_batch_end and on_validation_end within the same step (#9856) (#9867) Signed-off-by: ashors1 Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> --- nemo/lightning/pytorch/callbacks/model_checkpoint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/lightning/pytorch/callbacks/model_checkpoint.py b/nemo/lightning/pytorch/callbacks/model_checkpoint.py index eee3850dfb37..f3e8f7e6b40b 100644 --- a/nemo/lightning/pytorch/callbacks/model_checkpoint.py +++ b/nemo/lightning/pytorch/callbacks/model_checkpoint.py @@ -380,6 +380,8 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str) self.set_checkpoint_unfinished_marker(filepath, barrier_after=True) ema_callback = self._ema_callback(trainer) + self._last_global_step_saved = trainer.global_step + if ema_callback is not None: if self.async_save: raise ValueError('async_save with EMA not supported') @@ -422,7 +424,6 @@ def _get_finalize_save_checkpoint_callback( def _cb(): logging.debug(f'Finalize callback called for step {global_step}, filepath {filepath}') - self._last_global_step_saved = global_step self._last_checkpoint_saved = filepath from nemo.utils.get_rank import is_global_rank_zero