From 66fb92413ca4515f37a4b219328cf3fca48825a3 Mon Sep 17 00:00:00 2001 From: Yizhou Wang Date: Tue, 30 Jan 2024 02:13:52 +0800 Subject: [PATCH 1/2] launcher/launcher_helper.py: fix PMI name and add EnvironmentError (#5025) Hi, my last PR https://github.com/microsoft/DeepSpeed/pull/4699 about launcher_helper mistakenly listed "PMIX_RANK" twice. This PR corrects the list to "PMIX_RANK" and "PMI_RANK". It also raises _EnvironmentError_ when no rank variable is found in the current env, so that env is never assigned a _None_ value, which would otherwise trigger an env setting error. --- deepspeed/launcher/launcher_helper.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/deepspeed/launcher/launcher_helper.py b/deepspeed/launcher/launcher_helper.py index d4a8755be2e6..649364a4dd62 100644 --- a/deepspeed/launcher/launcher_helper.py +++ b/deepspeed/launcher/launcher_helper.py @@ -60,6 +60,8 @@ def env_mapping(env, rank_name_list=None, local_rank_name_list=None): rank = env.get(rank_name) elif rank != env.get(rank_name): raise EnvironmentError(f"rank number doesn't match!") + if rank == None: + raise EnvironmentError(f"rank number is not in current env!") env['RANK'] = rank local_rank = None @@ -69,6 +71,8 @@ def env_mapping(env, rank_name_list=None, local_rank_name_list=None): local_rank = env.get(local_rank_name) elif local_rank != env.get(local_rank_name): raise EnvironmentError(f"local_rank number doesn't match!") + if local_rank == None: + raise EnvironmentError(f"rank number is not in current env!") env['LOCAL_RANK'] = local_rank return env @@ -81,7 +85,7 @@ def main(args=None): args.launcher = args.launcher.lower() if args.launcher == MPICH_LAUNCHER: - rank_name_list = ["PMIX_RANK"] + ["PMIX_RANK"] + rank_name_list = ["PMIX_RANK"] + ["PMI_RANK"] local_rank_name_list = ["PALS_LOCAL_RANKID"] + ["MPI_LOCALRANKID"] env = env_mapping(env, rank_name_list=rank_name_list, local_rank_name_list=local_rank_name_list) else: From 8ec1cc3be315e2a3276a771e6de706aae91cd330 Mon Sep 17 00:00:00 2001 From: Max Kovalenko 
<75629718+deepcharm@users.noreply.github.com> Date: Mon, 29 Jan 2024 20:27:47 +0200 Subject: [PATCH 2/2] Graph capture support on HPU accelerators (#5013) Implementation of the graph capture and replay APIs for HPU accelerators. Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- accelerator/hpu_accelerator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index e0e1b57b5565..120e038dd227 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -174,13 +174,13 @@ def is_triton_supported(self): # Graph operations def create_graph(self): - return None + return self.hpu.HPUGraph() def capture_to_graph(self, graph, pool=None, stream=None): - from deepspeed.runtime.utils import noop_context - return noop_context() + return self.hpu.graph(graph, stream=stream) def replay_graph(self, graph): + graph.replay() return # Tensor operations