From 85cc5f9bb3f0175a2d13ea1ed65bf7d202b7f0d9 Mon Sep 17 00:00:00 2001
From: Hongwei Chen <33092912+hwchen2017@users.noreply.github.com>
Date: Thu, 26 Dec 2024 09:12:04 -0800
Subject: [PATCH 1/3] Fix error caused by all_reduce call in domino (#6880)

Fix #6851
Initialize communication backend to fix error caused by all_reduce call
in the Domino transformer layer.
Verified correctness in local test.

---------

Co-authored-by: Olatunji Ruwase
Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/runtime/domino/transformer.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/deepspeed/runtime/domino/transformer.py b/deepspeed/runtime/domino/transformer.py
index 8eb95e49c29d..88c5494c8147 100644
--- a/deepspeed/runtime/domino/transformer.py
+++ b/deepspeed/runtime/domino/transformer.py
@@ -6,8 +6,7 @@
 import torch
 import torch.nn.functional as F
 from torch.nn.parameter import Parameter
-import deepspeed
-from deepspeed import comm as dist
+import deepspeed.comm as dist
 from deepspeed.accelerator import get_accelerator


@@ -97,7 +96,7 @@ def backward(ctx, grad_output):
             return grad_output

         # Async All-reduce.
-        handle = deepspeed.comm.all_reduce(grad_output, group=ctx.mpu.get_tensor_model_parallel_group(), async_op=True)
+        handle = dist.all_reduce(grad_output, group=ctx.mpu.get_tensor_model_parallel_group(), async_op=True)
         ctx.handle_dic[ctx.h_id] = handle
         return None, grad_output, None, None

@@ -249,6 +248,10 @@ def __init__(self,
                  output_bias=None):
         super(DominoTransformerLayer, self).__init__()

+        if not dist.is_initialized():
+            dist.init_distributed()
+        assert dist.is_initialized(), "deepspeed.comm is not initialized!"
+
         self.llama_model = config.llama_model
         self.layer_number = layer_number
         self.layer_type = layer_type
@@ -358,18 +361,14 @@ def forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
                 layernorm_output0,
                 attention_mask,
                 rotary_pos_emb=rotary_pos_emb)
-        handle0 = deepspeed.comm.all_reduce(attention_output0,
-                                            group=self.mpu.get_tensor_model_parallel_group(),
-                                            async_op=True)
+        handle0 = dist.all_reduce(attention_output0, group=self.mpu.get_tensor_model_parallel_group(), async_op=True)

         attention_output1, attention_bias1 = \
             self.self_attention(
                 layernorm_output1,
                 attention_mask,
                 rotary_pos_emb=rotary_pos_emb)
-        handle1 = deepspeed.comm.all_reduce(attention_output1,
-                                            group=self.mpu.get_tensor_model_parallel_group(),
-                                            async_op=True)
+        handle1 = dist.all_reduce(attention_output1, group=self.mpu.get_tensor_model_parallel_group(), async_op=True)

         handle0.wait()
         # Residual0 connection.
@@ -413,7 +412,7 @@ def forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
             output0 = output0 + bias_c
         output0 = self.mlp_activation_func(output0)
         output0 = torch.matmul(output0, self.weight_r.t())
-        handle2 = deepspeed.comm.all_reduce(output0, group=self.mpu.get_tensor_model_parallel_group(), async_op=True)
+        handle2 = dist.all_reduce(output0, group=self.mpu.get_tensor_model_parallel_group(), async_op=True)

         handle1.wait()

@@ -425,7 +424,7 @@ def forward(self, hidden_states, attention_mask, rotary_pos_emb=None):
         if bias_c is not None:
             output1 = output1 + bias_c
         output1 = torch.matmul(output1, self.weight_r.t())
-        deepspeed.comm.all_reduce(output1, group=self.mpu.get_tensor_model_parallel_group())
+        dist.all_reduce(output1, group=self.mpu.get_tensor_model_parallel_group())

         handle2.wait()


From cc03c76d57f41752d8cfb84c2e45b8e0da8083da Mon Sep 17 00:00:00 2001
From: Raza Sikander
Date: Fri, 27 Dec 2024 01:37:28 +0530
Subject: [PATCH 2/3] Update Gaudi2 jobs to latest 1.19 build (#6905)

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 .github/workflows/hpu-gaudi2-nightly.yml | 2 +-
 .github/workflows/hpu-gaudi2.yml         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/hpu-gaudi2-nightly.yml b/.github/workflows/hpu-gaudi2-nightly.yml
index 5c5caff1ebb0..c0576360cd61 100644
--- a/.github/workflows/hpu-gaudi2-nightly.yml
+++ b/.github/workflows/hpu-gaudi2-nightly.yml
@@ -21,7 +21,7 @@ jobs:
     # The type of runner that the job will run on
     runs-on: [self-hosted, intel, gaudi2]
     container:
-      image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+      image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
       ports:
         - 80
       options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice
diff --git a/.github/workflows/hpu-gaudi2.yml b/.github/workflows/hpu-gaudi2.yml
index a06f871b7c56..b8b6f3cb5502 100644
--- a/.github/workflows/hpu-gaudi2.yml
+++ b/.github/workflows/hpu-gaudi2.yml
@@ -39,7 +39,7 @@ jobs:
     # The type of runner that the job will run on
     runs-on: [self-hosted, intel, gaudi2]
     container:
-      image: vault.habana.ai/gaudi-docker/1.18.0/ubuntu22.04/habanalabs/pytorch-installer-2.4.0:latest
+      image: vault.habana.ai/gaudi-docker/1.19.0/ubuntu22.04/habanalabs/pytorch-installer-2.5.1:latest
       ports:
         - 80
       options: --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice

From 3573858e7ce2c723b8c43231c6c6b0cf97dca2fc Mon Sep 17 00:00:00 2001
From: Nir Sonnenschein
Date: Mon, 30 Dec 2024 20:53:41 +0200
Subject: [PATCH 3/3] Change compile for pipeline module torch.compile (#6478)

We have encountered an issue with torch.compile and the pipeline module:
modifying a member of the module (micro_offset) during the forward function
causes torch.compile to restart its analysis and treat the module as dynamic.
To bypass this issue without significantly changing the way the pipeline
module works, we propose compiling only the layers in the pipeline module
instead of its forward function. This should still give most of the benefit
of compiling the pipeline module while avoiding the issue.
---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
---
 deepspeed/runtime/pipe/module.py    | 8 ++++++++
 tests/unit/pipe/test_pipe_module.py | 8 ++++++--
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/deepspeed/runtime/pipe/module.py b/deepspeed/runtime/pipe/module.py
index 31fec30be788..9fbd91f750a9 100644
--- a/deepspeed/runtime/pipe/module.py
+++ b/deepspeed/runtime/pipe/module.py
@@ -662,3 +662,11 @@ def get_additional_losses(self):
         Return a dictionary of {"loss name": loss_value} or None if no additional losses.
         """
         return None
+
+    def compile(self, *args, **kwargs):
+        for idx, layer in enumerate(self.forward_funcs):
+            if isinstance(layer, nn.Module):
+                layer.compile(*args, **kwargs)
+            else:
+                new_layer = torch.compile(layer, *args, **kwargs)
+                self.forward_funcs[idx] = new_layer
diff --git a/tests/unit/pipe/test_pipe_module.py b/tests/unit/pipe/test_pipe_module.py
index 05c6a82ef55a..2a8a4b9b7d82 100644
--- a/tests/unit/pipe/test_pipe_module.py
+++ b/tests/unit/pipe/test_pipe_module.py
@@ -60,9 +60,12 @@ def batch_input():

 class TestPipeModuleSequential(DistributedTest):
     world_size = 2
+    # needs to be set for torch.compile: running torch.compile with daemonic process causes an error
+    non_daemonic_procs = True

     @pytest.mark.parametrize("activation_checkpoints", [False, True])
-    def test(self, sequential_model, simple_config, batch_input, activation_checkpoints):
+    @pytest.mark.parametrize("use_compile", [False, True])
+    def test(self, sequential_model, simple_config, batch_input, activation_checkpoints, use_compile):
         base_model = copy.deepcopy(sequential_model)
         base_input = batch_input.clone().detach()
         base_output = base_model(base_input)
@@ -71,7 +74,8 @@ def test(self, sequential_model, simple_config, batch_input, activation_checkpoi

         pipe_model = copy.deepcopy(sequential_model)
         pipe_model = PipelineModule(layers=pipe_model, num_stages=2)
-
+        if (use_compile):
+            pipe_model.compile()
         # Ensure all parameters are accounted for.
         my_params = sum(p.numel() for p in pipe_model.parameters())
         total_pipe_params = torch.LongTensor([my_params]).to(get_accelerator().device_name())
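
A minimal usage sketch of the new PipelineModule.compile() entry point from PATCH 3/3, not part of the patches above. It assumes a DeepSpeed launch with at least two ranks (for example via the deepspeed launcher); the toy layer list and num_stages=2 are hypothetical and only mirror tests/unit/pipe/test_pipe_module.py.

    # Illustrative sketch, assuming a distributed run with >= 2 ranks,
    # e.g. `deepspeed --num_gpus 2 this_script.py`.
    import torch.nn as nn

    import deepspeed
    from deepspeed.pipe import PipelineModule

    deepspeed.init_distributed()  # PipelineModule needs an initialized comm backend

    layers = [nn.Linear(128, 128) for _ in range(8)]          # toy pipeline layers
    pipe_model = PipelineModule(layers=layers, num_stages=2)  # split across 2 stages

    # With PATCH 3/3 applied, compile() compiles each entry of forward_funcs
    # rather than wrapping PipelineModule.forward, so mutating micro_offset in
    # forward no longer forces torch.compile to re-analyze the whole module.
    pipe_model.compile()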