fix a minor bug with async checkpointing where a checkpoint would get saved on_train_batch_end and on_validation_end within the same step (#9856) (#9867)

Signed-off-by: ashors1 <ashors@nvidia.com>
Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com>
2 people authored and monica-sekoyan committed Oct 11, 2024
1 parent d398227 commit 2f26d1c
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion in nemo/lightning/pytorch/callbacks/model_checkpoint.py

@@ -380,6 +380,8 @@ def _save_checkpoint(self, trainer: 'pytorch_lightning.Trainer', filepath: str)
         self.set_checkpoint_unfinished_marker(filepath, barrier_after=True)
         ema_callback = self._ema_callback(trainer)
 
+        self._last_global_step_saved = trainer.global_step
+
         if ema_callback is not None:
             if self.async_save:
                 raise ValueError('async_save with EMA not supported')
@@ -422,7 +424,6 @@ def _get_finalize_save_checkpoint_callback(
 
         def _cb():
             logging.debug(f'Finalize callback called for step {global_step}, filepath {filepath}')
-            self._last_global_step_saved = global_step
             self._last_checkpoint_saved = filepath
 
             from nemo.utils.get_rank import is_global_rank_zero
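For context, the duplicate-save guard that this one-line move restores works roughly as sketched below. This is an illustrative, self-contained sketch rather than the NeMo code: the class, the fake trainer, and the in-memory save list are stand-ins, and only the idea it demonstrates mirrors the commit, namely recording `_last_global_step_saved` when the save is enqueued instead of in the async finalize callback. With the marker set immediately, a second save hook firing in the same global step (for example on_validation_end right after on_train_batch_end) is detected and skipped even though the asynchronous write has not finished yet.

```python
# Minimal sketch of the dedup logic this commit fixes (names here are
# illustrative, not NeMo's API): record the saved step at enqueue time so a
# second hook in the same global step does not trigger another checkpoint.

from dataclasses import dataclass


@dataclass
class FakeTrainer:
    global_step: int = 0


class SimpleAsyncCheckpoint:
    def __init__(self):
        self._last_global_step_saved = -1
        self.saves_enqueued = []

    def _should_skip(self, trainer) -> bool:
        # on_train_batch_end and on_validation_end both route through the
        # save path; a matching step means this step was already handled.
        return self._last_global_step_saved == trainer.global_step

    def _save_checkpoint(self, trainer, filepath: str) -> None:
        if self._should_skip(trainer):
            return
        # The fix: update the marker before the async write completes.
        # Deferring this to the async finalize callback leaves a window in
        # which a second hook in the same step enqueues a duplicate save.
        self._last_global_step_saved = trainer.global_step
        self.saves_enqueued.append(filepath)  # stand-in for the async write


if __name__ == "__main__":
    cb, trainer = SimpleAsyncCheckpoint(), FakeTrainer(global_step=100)
    cb._save_checkpoint(trainer, "step=100.ckpt")  # from on_train_batch_end
    cb._save_checkpoint(trainer, "step=100.ckpt")  # from on_validation_end
    assert cb.saves_enqueued == ["step=100.ckpt"]  # only one save enqueued
```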
