Skip to content

Commit

Permalink
Fix CheckpointManager bugs (#994)
Browse files Browse the repository at this point in the history
  • Loading branch information
cbalioglu authored Jan 27, 2025
1 parent c1837c1 commit ab502c9
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 7 deletions.
10 changes: 5 additions & 5 deletions src/fairseq2/checkpoint/_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ def __init__(
self._tensor_loader = tensor_loader
self._tensor_dumper = tensor_dumper

- if gangs.tp.rank > 1:
+ if gangs.tp.size > 1:
self._shard_suffix = f".{gangs.tp.rank}"
else:
self._shard_suffix = ""
Expand Down Expand Up @@ -468,15 +468,15 @@ def maybe_with_dp_process_group() -> AbstractContextManager[None]:

def load_part(filename: str) -> dict[str, object]:
with maybe_with_dp_process_group(): # Required for `ShardedTensor`.
+ file = step_dir.joinpath(filename)
+
try:
- part = self._tensor_loader.load(
- step_dir.joinpath(filename), map_location=CPU
- )
+ part = self._tensor_loader.load(file, map_location=CPU)
except FileNotFoundError:
part = {}
except TensorLoadError as ex:
raise CheckpointLoadError(
- step_nr, f"The '{filename}' checkpoint file of training step {step_nr} cannot be loaded. See the nested exception for details." # fmt: skip
+ step_nr, f"The '{file}' checkpoint file of training step {step_nr} cannot be loaded. See the nested exception for details." # fmt: skip
) from ex

self._gangs.root.barrier()
Expand Down
4 changes: 2 additions & 2 deletions src/fairseq2/utils/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,15 +229,15 @@ def load_error() -> TensorLoadError:

try:
fp = self._file_system.open(path)
+ except FileNotFoundError:
+ raise
except OSError as ex:
raise load_error() from ex

try:
data: dict[str, object] = torch.load(
fp, map_location, weights_only=self._restrict # type: ignore[arg-type]
)
- except FileNotFoundError:
- raise
except (RuntimeError, OSError, PickleError) as ex:
raise load_error() from ex
finally:
Expand Down

0 comments on commit ab502c9

Please sign in to comment.