From 0c6a0628ef17fac513a297f084adaa3a61117e0d Mon Sep 17 00:00:00 2001
From: Abhishek Maurya <124327945+Abhishek-TAMU@users.noreply.github.com>
Date: Thu, 26 Sep 2024 13:31:27 -0400
Subject: [PATCH] fix: unable to find output_dir in multi-GPU during resume_from_checkpoint check (#352)

* fix: output_dir doesn't exist during resume_from_checkpoint

Signed-off-by: Abhishek

* fix: fmt

Signed-off-by: Abhishek

---------

Signed-off-by: Abhishek
Signed-off-by: Anh Uong
---
 build/accelerate_launch.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/build/accelerate_launch.py b/build/accelerate_launch.py
index 0a0cfa75..6cbc7d25 100644
--- a/build/accelerate_launch.py
+++ b/build/accelerate_launch.py
@@ -98,6 +98,8 @@ def main():
     #
     ##########
     output_dir = job_config.get("output_dir")
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
     try:
         # checkpoints outputted to tempdir, only final checkpoint copied to output dir
         launch_command(args)