From a20c79ca9520ba212aa9947c0d9c0eff0a236880 Mon Sep 17 00:00:00 2001
From: hughpu <hughpu@hotmail.com>
Date: Mon, 7 Aug 2023 19:20:15 +0800
Subject: [PATCH 01/13] feat: add `non_reentrant_checkpoint`

---
 .../activation_checkpointing/checkpointing.py | 214 ++++++++++++++++++
 1 file changed, 214 insertions(+)

diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index 4e3b655cda27..3677e7706e9e 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -18,6 +18,7 @@
 import torch
 import contextlib
 from deepspeed import comm as dist
+import weakref
 
 import mmap
 from torch import _C
@@ -705,6 +706,219 @@ def backward(ctx, *grads):
         return tuple(ret_list)
 
 
+def non_reentrant_checkpoint(function, *args):
+    global mpu, timers, SYNCHRONIZE, PROFILE_TIME
+
+    deepspeed_saved_tensors = None
+    non_tensor_args = None
+    tensor_flags = None
+    def save_args_for_backward(*all_args):
+        nonlocal deepspeed_saved_tensors, non_tensor_args, tensor_flags
+        tensor_args, non_tensor_args, tensor_flags = extract_tensors(all_objects=all_args)
+        deepspeed_saved_tensors = tensor_args
+        non_tensor_args = non_tensor_args
+        tensor_flags = tensor_flags
+
+    if SYNCHRONIZE:
+        get_accelerator().synchronize()
+
+    if timers is None and PROFILE_TIME:
+        timers = Timers()
+
+    if PROFILE_TIME:
+        timers(FORWARD_GLOBAL_TIMER).start()
+
+    global num_layers
+    global mp_rank, mp_size, mp_group
+    global contiguous_data_buffers, contiguous_size_buffers
+    global data_offsets, size_offsets
+    if mp_rank is None:
+        if mpu is not None:
+            if hasattr(mpu, 'get_tensor_model_parallel_rank'):
+                mp_rank = mpu.get_tensor_model_parallel_rank()
+                mp_size = mpu.get_tensor_model_parallel_world_size()
+                mp_group = mpu.get_tensor_model_parallel_group()
+            else:
+                mp_rank = mpu.get_model_parallel_rank()
+                mp_size = mpu.get_model_parallel_world_size()
+                mp_group = mpu.get_model_parallel_group()
+        else:
+            mp_rank = 0
+            mp_size = 1
+            mp_group = None
+
+    global cuda_device, transport_stream, PARTITION_ACTIVATIONS, buffer_0, buffer_1, buffer_0_offset, buffer_1_offset
+
+    if cuda_device is None:
+        see_memory_usage("First Forward Beginning", force=False)
+        if dist.get_rank() == 0:
+            logger.info(f"Activation Checkpointing Information")
+            logger.info(f"----Partition Activations {PARTITION_ACTIVATIONS}, CPU CHECKPOINTING {CPU_CHECKPOINT}")
+            logger.info(
+                f"----contiguous Memory Checkpointing {CONTIGUOUS_CHECKPOINTING} with {num_layers} total layers")
+            logger.info(f"----Synchronization {SYNCHRONIZE}")
+            logger.info(f"----Profiling time in checkpointing {PROFILE_TIME}")
+
+        cuda_device = get_accelerator().current_device_name()
+        transport_stream = get_accelerator().Stream(device=cuda_device)
+
+    if PARTITION_ACTIVATIONS:
+        inputs = partition_activations(args, CPU_CHECKPOINT, CONTIGUOUS_CHECKPOINTING)
+    elif CPU_CHECKPOINT:
+        inputs = copy_to_device(args, device=torch.device('cpu'), criterion_func=is_activation_to_checkpoint)
+
+    # just in case something funky is happening such as reuse of inputs
+    inputs_cuda = copy_to_device(args, device=cuda_device, criterion_func=is_activation_to_checkpoint)
+
+    # Copy the rng states.
+    fwd_cpu_rng_state = torch.get_rng_state()
+    fwd_cuda_rng_state = get_accelerator().get_rng_state()
+    fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+    see_memory_usage("Before running forward on the layer", force=False)
+
+    if PARTITION_ACTIVATIONS:
+        new_args = get_partitioned_activations_for_backward(args, inputs, CONTIGUOUS_CHECKPOINTING)
+        assert len(new_args) % 2 == 0, f'save_for_backward called with odd number of args, {len(new_args)}'
+        save_args_for_backward(*new_args)
+    elif CPU_CHECKPOINT:
+        new_args = get_cpu_activations_for_backward(args, inputs)
+        save_args_for_backward(*new_args)
+    else:
+        save_args_for_backward(*args)
+    
+    class Holder():
+        pass
+
+    storage: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
+    weak_holder_list = []
+    leaf_tensors = []
+
+    def checkpoint_pack(tensor_from_forward):
+        res = Holder()
+        weak_holder_list.append(weakref.ref(res))
+        return res
+    
+    def checkpoint_unpack(holder_from_backward):
+        if len(storage) == 0:
+            unpack_counter = 0
+
+            def replay_pack(tensor_from_replay):
+                nonlocal unpack_counter
+                unpack_counter += 1
+                
+                if weak_holder_list[unpack_counter - 1]() is None:
+                    return
+                
+                detached_activations = tensor_from_replay.detach()
+                storage[weak_holder_list[unpack_counter - 1]()] = detached_activations
+                if tensor_from_replay.requires_grad and tensor_from_replay.is_leaf:
+                    leaf_tensors.append(tensor_from_replay)
+                
+                return
+            
+            def replay_unpack(none_value):
+                raise RuntimeError("You are calling backwards on a tensor that is never exposed.")
+            
+            global timers
+            see_memory_usage("In backward", force=False)
+            # removing pointers to the contiguous buffer memory
+            # so that they can be garbage collected once the checkpoints
+            # have been used
+            if SYNCHRONIZE:
+                get_accelerator().synchronize()
+            if PROFILE_TIME:
+                timers('backward').start()
+
+            if CONTIGUOUS_CHECKPOINTING:
+                global data_offsets, size_offsets
+                global contiguous_data_buffers, contiguous_size_buffers
+
+                for buffers in contiguous_data_buffers:
+                    buffers = []
+
+                # frees up all the pointers to the checkpoints except for the ones
+                # stored by save for backward
+                contiguous_data_buffers = []
+                contiguous_size_buffers = []
+                data_offsets = []
+                size_offsets = []
+
+            see_memory_usage("In backward checkpointing code", force=False)
+            if not torch.autograd._is_checkpoint_valid():
+                raise RuntimeError("Checkpointing is not compatible with .grad(), "
+                                   "please use .backward() if possible")
+
+            global cuda_device, transport_stream, PARTITION_ACTIVATIONS
+
+            if PARTITION_ACTIVATIONS:
+                # with get_accelerator().stream(transport_stream):
+                inputs = gather_partitioned_activations(deepspeed_saved_tensors,
+                                                        device=cuda_device if CPU_CHECKPOINT else None)
+                detached_inputs = detach_variable(inputs)
+            elif CPU_CHECKPOINT:
+                inputs = move_to_device(deepspeed_saved_tensors, cuda_device, is_activation_to_checkpoint)
+                detached_inputs = detach_variable(inputs)
+            else:
+                inputs = deepspeed_saved_tensors
+                detached_inputs = detach_variable(inputs)
+
+            # Add non tensor input args
+            detached_inputs = merge_tensors(tensor_objects=detached_inputs,
+                                            non_tensor_objects=non_tensor_args,
+                                            tensor_flags=tensor_flags)
+
+            # Store the current states.
+            bwd_cpu_rng_state = torch.get_rng_state()
+            bwd_cuda_rng_state = get_accelerator().get_rng_state()
+            bwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
+
+            # Set the states to what it used to be before the forward pass.
+            torch.set_rng_state(fwd_cpu_rng_state)
+            _set_cuda_rng_state(fwd_cuda_rng_state)
+            get_cuda_rng_tracker().set_states(fwd_cuda_rng_state_tracker)
+
+            see_memory_usage("In backward checkpointing code before forward", force=False)
+            with torch.enable_grad(), torch.autograd.graph.saved_tensors_hooks(replay_pack, replay_unpack):
+                _unused = function(*detached_inputs)
+
+            see_memory_usage("In backward checkpointing code after forward", force=False)
+            # Set the states back to what it was at the start of this function.
+            torch.set_rng_state(bwd_cpu_rng_state)
+            _set_cuda_rng_state(bwd_cuda_rng_state)
+            get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
+            
+            deepspeed_saved_tensors = None
+            non_tensor_args = None
+            tensor_flags = None
+
+
+        if holder_from_backward not in storage:
+            raise RuntimeError(
+                "Attempt to retrieve a tensor saved by autograd multiple times without checkpoint"
+                " recomputation being triggered in between, this is not currently supported."
+            )
+        
+        return storage[holder_from_backward]
+
+    def after_backward_hook(_nonuse_grads):
+        see_memory_usage("After backward checkpointing code after backward", force=False)
+
+        if PROFILE_TIME:
+            timers('backward').stop()
+            timers.log(['backward'])
+        if SYNCHRONIZE:
+            get_accelerator().synchronize()
+    
+    torch.autograd.graph.register_multi_grad_hook(leaf_tensors, after_backward_hook)
+    
+
+    with torch.autograd.graph.saved_tensors_hooks(checkpoint_pack, checkpoint_unpack):
+        output = function(*inputs_cuda)
+    
+    return output
+
+
 def checkpoint(function, *args):
     """Checkpoint a model or part of the model.
     This has been directly copied from torch.utils.checkpoint. """

From 8aeba5f618833809807de0b2a2156774261df309 Mon Sep 17 00:00:00 2001
From: purk <hughpu@hotmail.com>
Date: Mon, 7 Aug 2023 22:57:11 +0800
Subject: [PATCH 02/13] feat: add missing output postprocess and change the
 hook to record leaf forward tensor refs

---
 .../activation_checkpointing/checkpointing.py | 28 ++++++++++++++-----
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index 3677e7706e9e..496c7a4bf339 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -775,8 +775,6 @@ def save_args_for_backward(*all_args):
     fwd_cuda_rng_state = get_accelerator().get_rng_state()
     fwd_cuda_rng_state_tracker = get_cuda_rng_tracker().get_states()
 
-    see_memory_usage("Before running forward on the layer", force=False)
-
     if PARTITION_ACTIVATIONS:
         new_args = get_partitioned_activations_for_backward(args, inputs, CONTIGUOUS_CHECKPOINTING)
         assert len(new_args) % 2 == 0, f'save_for_backward called with odd number of args, {len(new_args)}'
@@ -797,6 +795,8 @@ class Holder():
     def checkpoint_pack(tensor_from_forward):
         res = Holder()
         weak_holder_list.append(weakref.ref(res))
+        if tensor_from_forward.requires_grad and tensor_from_forward.is_leaf:
+            leaf_tensors.append(tensor_from_forward)
         return res
     
     def checkpoint_unpack(holder_from_backward):
@@ -812,8 +812,6 @@ def replay_pack(tensor_from_replay):
                 
                 detached_activations = tensor_from_replay.detach()
                 storage[weak_holder_list[unpack_counter - 1]()] = detached_activations
-                if tensor_from_replay.requires_grad and tensor_from_replay.is_leaf:
-                    leaf_tensors.append(tensor_from_replay)
                 
                 return
             
@@ -911,12 +909,28 @@ def after_backward_hook(_nonuse_grads):
             get_accelerator().synchronize()
     
     torch.autograd.graph.register_multi_grad_hook(leaf_tensors, after_backward_hook)
-    
 
     with torch.autograd.graph.saved_tensors_hooks(checkpoint_pack, checkpoint_unpack):
-        output = function(*inputs_cuda)
+        outputs = function(*inputs_cuda)
+
+    see_memory_usage("After running forward on the layer", force=False)
+
+    if PROFILE_TIME:
+        timers(FORWARD_GLOBAL_TIMER).stop()
+        timers.log([FORWARD_GLOBAL_TIMER])
+    if SYNCHRONIZE:
+        get_accelerator().synchronize()
+
+    all_outputs = []
+    if torch.is_tensor(outputs):
+        all_outputs += [outputs]
+    else:
+        all_outputs += outputs
     
-    return output
+    if len(all_outputs) == 1:
+        return all_outputs[0]
+    else:
+        return tuple(all_outputs)
 
 
 def checkpoint(function, *args):

From ee04fa8f7e2883f823530e9be8f0b16eb351400f Mon Sep 17 00:00:00 2001
From: purk <hughpu@hotmail.com>
Date: Mon, 7 Aug 2023 23:01:38 +0800
Subject: [PATCH 03/13] fix: make the multi_grad_hook registered after graph
 construction

---
 deepspeed/runtime/activation_checkpointing/checkpointing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index 496c7a4bf339..59c867820fe4 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -908,10 +908,10 @@ def after_backward_hook(_nonuse_grads):
         if SYNCHRONIZE:
             get_accelerator().synchronize()
     
-    torch.autograd.graph.register_multi_grad_hook(leaf_tensors, after_backward_hook)
 
     with torch.autograd.graph.saved_tensors_hooks(checkpoint_pack, checkpoint_unpack):
         outputs = function(*inputs_cuda)
+        torch.autograd.graph.register_multi_grad_hook(leaf_tensors, after_backward_hook)
 
     see_memory_usage("After running forward on the layer", force=False)
 

From 51f833d41379d580bd26370649e4b825fbdb63c5 Mon Sep 17 00:00:00 2001
From: hughpu <hughpu@hotmail.com>
Date: Tue, 8 Aug 2023 13:29:07 +0800
Subject: [PATCH 04/13] fix: backward compatibility for multi_tensor_hook

---
 .../activation_checkpointing/checkpointing.py | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index 59c867820fe4..d76f3aeedc32 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -791,6 +791,7 @@ class Holder():
     storage: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
     weak_holder_list = []
     leaf_tensors = []
+    backward_visited_leaf_nodes = 0
 
     def checkpoint_pack(tensor_from_forward):
         res = Holder()
@@ -900,18 +901,23 @@ def replay_unpack(none_value):
         return storage[holder_from_backward]
 
     def after_backward_hook(_nonuse_grads):
-        see_memory_usage("After backward checkpointing code after backward", force=False)
+        nonlocal leaf_tensors, backward_visited_leaf_nodes
+        backward_visited_leaf_nodes += 1
+        
+        if backward_visited_leaf_nodes == len(leaf_tensors):
+            see_memory_usage("After backward checkpointing code after backward", force=False)
 
-        if PROFILE_TIME:
-            timers('backward').stop()
-            timers.log(['backward'])
-        if SYNCHRONIZE:
-            get_accelerator().synchronize()
+            if PROFILE_TIME:
+                timers('backward').stop()
+                timers.log(['backward'])
+            if SYNCHRONIZE:
+                get_accelerator().synchronize()
     
 
     with torch.autograd.graph.saved_tensors_hooks(checkpoint_pack, checkpoint_unpack):
         outputs = function(*inputs_cuda)
-        torch.autograd.graph.register_multi_grad_hook(leaf_tensors, after_backward_hook)
+        for leaf_tensor in leaf_tensors:
+            leaf_tensor.register_hook(after_backward_hook)
 
     see_memory_usage("After running forward on the layer", force=False)
 

From b29c1efa20862cdb3e74e7566ef7378ce2749031 Mon Sep 17 00:00:00 2001
From: hughpu <hughpu@hotmail.com>
Date: Tue, 8 Aug 2023 13:51:00 +0800
Subject: [PATCH 05/13] fix: nonlocal reference error of
 deepspeed_saved_tensors

---
 deepspeed/runtime/activation_checkpointing/checkpointing.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index d76f3aeedc32..fb901ad6c4a9 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -801,6 +801,7 @@ def checkpoint_pack(tensor_from_forward):
         return res
     
     def checkpoint_unpack(holder_from_backward):
+        nonlocal deepspeed_saved_tensors, non_tensor_args, tensor_flags
         if len(storage) == 0:
             unpack_counter = 0
 

From 37e7c2349afabf2e6afd22e48e810262a22e8504 Mon Sep 17 00:00:00 2001
From: hughpu <hughpu@hotmail.com>
Date: Tue, 8 Aug 2023 19:35:37 +0800
Subject: [PATCH 06/13] fix: reduce repeating hook registration

---
 deepspeed/runtime/activation_checkpointing/checkpointing.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index fb901ad6c4a9..f4990e5c592d 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -917,8 +917,8 @@ def after_backward_hook(_nonuse_grads):
 
     with torch.autograd.graph.saved_tensors_hooks(checkpoint_pack, checkpoint_unpack):
         outputs = function(*inputs_cuda)
-        for leaf_tensor in leaf_tensors:
-            leaf_tensor.register_hook(after_backward_hook)
+    for leaf_tensor in leaf_tensors:
+        leaf_tensor.register_hook(after_backward_hook)
 
     see_memory_usage("After running forward on the layer", force=False)
 

From e22c48772bba7dbc353986022389a1a428e56cbb Mon Sep 17 00:00:00 2001
From: hughpu <hughpu@hotmail.com>
Date: Wed, 9 Aug 2023 16:08:18 +0800
Subject: [PATCH 07/13] test: add test for
 `activation_checkpointing.checkpointing.non_reentrant_checkpoint`

---
 ..._activation_checkpointing_non_reentrant.py | 83 +++++++++++++++++++
 1 file changed, 83 insertions(+)
 create mode 100644 tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py

diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
new file mode 100644
index 000000000000..378ae9d4b15c
--- /dev/null
+++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
@@ -0,0 +1,83 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+
+# TODO: add tests with model parallelism for activation partitioning and other features.
+
+from deepspeed.runtime.activation_checkpointing.checkpointing import non_reentrant_checkpoint
+import test_activation_checkpointing
+from test_activation_checkpointing import *
+from test_activation_checkpointing import (
+    _bool_to_float, _compute, _match_outputs, _mixed_mask,
+    _prep_inputs, _test_activation_checkpoint, _test_activation_checkpoint_ordering
+)
+
+ckpt = non_reentrant_checkpoint
+
+# both bool and float are important, as bool is not differentiable
+@pytest.mark.parametrize('mask', [
+    _mixed_mask(),
+    _bool_to_float(_mixed_mask()),
+])
+class TestActivationCheckpointWithoutGrad(DistributedTest):
+    world_size = 1
+
+    def test_ckpt_inputs1_outputs1(self, mask):
+        module = torch.nn.Linear(HIDDEN_DIM, HIDDEN_DIM)
+        inputs = torch.rand(HIDDEN_DIM)
+        _test_activation_checkpoint(module, inputs)
+
+    def test_ckpt_inputs2_outputs1(self, mask):
+        module = MaskedLinear(HIDDEN_DIM, HIDDEN_DIM)
+        inputs = torch.rand(HIDDEN_DIM)
+        _test_activation_checkpoint(module, inputs, mask)
+
+    def test_ckpt_inputs2_outputs2(self, mask):
+        module = MaskedLinearSeq(HIDDEN_DIM, HIDDEN_DIM)
+        inputs = torch.rand(HIDDEN_DIM)
+        _test_activation_checkpoint(module, inputs, mask)
+
+    def test_ckpt_inputs2_outputs3(self, mask):
+        module = MaskedLinearSeqDup(HIDDEN_DIM, HIDDEN_DIM)
+        inputs = torch.rand(HIDDEN_DIM)
+        _test_activation_checkpoint(module, inputs, mask)
+
+    def test_ckpt_arg_none(self, mask):
+        module = DropMaskLinear(HIDDEN_DIM, HIDDEN_DIM)
+        inputs = (torch.rand(HIDDEN_DIM), None)
+        _test_activation_checkpoint(module, *inputs)
+
+
+@pytest.mark.parametrize('non_tensor', [None, 2, True, (None, 2.5), (None, True, torch.randn(HIDDEN_DIM))])
+class TestCheckpointNonTensorWithoutGrad(DistributedTest):
+    world_size = 1
+
+    def test_ckpt_non_tensor_input(self, non_tensor):
+        module = LinearNonTensorInput(HIDDEN_DIM, HIDDEN_DIM)
+        inputs = torch.rand(HIDDEN_DIM)
+        _test_activation_checkpoint(module, inputs, non_tensor)
+
+    def test_ckpt_non_tensor_output(self, non_tensor):
+        module = LinearNonTensorOutput(non_tensor)
+        inputs = torch.rand(HIDDEN_DIM)
+        _test_activation_checkpoint(module, inputs)
+
+
+@pytest.mark.parametrize('non_tensor_output', [
+    None, (torch.randn(HIDDEN_DIM), 2.5), (None, torch.randn(HIDDEN_DIM), True), (None, True, torch.randn(HIDDEN_DIM))
+])
+class TestCheckpointNonTensorOutputOrderingWithoutGrad(DistributedTest):
+    world_size = 1
+
+    def test_ckpt_non_tensor_output_ordering(self, non_tensor_output):
+        module = LinearNonTensorOutput(non_tensor_output)
+        inputs = torch.rand(HIDDEN_DIM)
+
+        # First return is a tensor
+        ordering = [True]
+        if type(non_tensor_output) in [list, tuple]:
+            ordering += [torch.is_tensor(t) for t in non_tensor_output]
+        else:
+            ordering += [torch.is_tensor(non_tensor_output)]
+        _test_activation_checkpoint_ordering(module, ordering, inputs)
\ No newline at end of file

From 4d2a274b9991a83d5c176f0a809b78a3863c755c Mon Sep 17 00:00:00 2001
From: Connor Holmes <connorholmes@microsoft.com>
Date: Tue, 8 Aug 2023 18:55:44 -0700
Subject: [PATCH 08/13] Pass correct node size for ZeRO++ (#4085)

* Pass correct node size

* formatting

---------

Co-authored-by: Connor Holmes <development@cmikeh2.me>
Co-authored-by: Michael Wyatt <michaelwyatt@microsoft.com>
---
 csrc/quantization/pt_binding.cpp                | 14 +++++++-------
 deepspeed/runtime/comm/coalesced_collectives.py |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp
index 2bc9f89bbee9..66cfce708a95 100644
--- a/csrc/quantization/pt_binding.cpp
+++ b/csrc/quantization/pt_binding.cpp
@@ -184,7 +184,8 @@ std::vector<at::Tensor> quantized_reduction(at::Tensor& input_vals,
                                             int in_groups,
                                             int out_groups,
                                             int num_bits,
-                                            quantize::Type quant_type)
+                                            quantize::Type quant_type,
+                                            int devices_per_node)
 {
     auto scales_options = at::TensorOptions()
                               .dtype(at::kFloat)
@@ -201,25 +202,24 @@ std::vector<at::Tensor> quantized_reduction(at::Tensor& input_vals,
                               .requires_grad(false);
 
     std::vector<long int> sz(input_vals.sizes().begin(), input_vals.sizes().end());
-    const int gpu_per_node = 16;                   // depend on machine in_groups/out_groups;
-    sz[sz.size() - 1] = sz.back() / gpu_per_node;  // num of GPU per nodes
-    const int elems_per_in_tensor = at::numel(input_vals) / gpu_per_node;
+    sz[sz.size() - 1] = sz.back() / devices_per_node;  // num of GPU per nodes
+    const int elems_per_in_tensor = at::numel(input_vals) / devices_per_node;
     auto output = torch::empty(sz, output_options);
 
-    const int elems_per_in_group = elems_per_in_tensor / (in_groups / gpu_per_node);
+    const int elems_per_in_group = elems_per_in_tensor / (in_groups / devices_per_node);
     const int elems_per_out_group = elems_per_in_tensor / out_groups;
 
     launch_dequant_reduce((int8_t*)output.data_ptr(),
                           (float*)scales.data_ptr(),
                           (const int8_t*)input_vals.data_ptr(),
                           (const float*)input_scales.data_ptr(),
-                          gpu_per_node,
+                          devices_per_node,
                           num_bits,
                           quant_type,
                           out_groups,
                           elems_per_out_group,
                           elems_per_in_tensor,
-                          in_groups / gpu_per_node,
+                          in_groups / devices_per_node,
                           elems_per_in_group,
                           at::cuda::getCurrentCUDAStream());
     return {output, scales};
diff --git a/deepspeed/runtime/comm/coalesced_collectives.py b/deepspeed/runtime/comm/coalesced_collectives.py
index 0e23476064bd..b8134b453e39 100644
--- a/deepspeed/runtime/comm/coalesced_collectives.py
+++ b/deepspeed/runtime/comm/coalesced_collectives.py
@@ -56,7 +56,8 @@ def all_to_all_quant_reduce(tensors: List[Tensor], groups: {}) -> List[Tensor]:
             all_to_all_single(local_output, intra_quant_int4, group=groups[f'local_{intra_idx}'])
             all_to_all_single(scale_output, intra_q_scales, group=groups[f'local_{intra_idx}'])
             global_input_tensor, global_scales = quantizer_module.quantized_reduction(
-                local_output, scale_output, intra_quant_group, inter_quant_group, 4, quantizer_module.Symmetric)
+                local_output, scale_output, intra_quant_group, inter_quant_group, 4, quantizer_module.Symmetric,
+                local_world_size)
             global_output = torch.empty_like(global_input_tensor)
             global_scale_output = torch.empty_like(global_scales)
             all_to_all_single(global_output, global_input_tensor, group=groups[f'global_{inter_idx}'])

From d4d070b8612f4b92eff1a462392d6b892fbe7c1b Mon Sep 17 00:00:00 2001
From: Conglong Li <conglong.li@gmail.com>
Date: Tue, 8 Aug 2023 18:57:21 -0700
Subject: [PATCH 09/13] add deepspeed chat arxiv report (#4110)

* add deepspeed chat arxiv report

* add zeroquant v2 and fp

* add selective enhancement

* add ignore for 'Youn' in spell checker

---------

Co-authored-by: yaozhewei <zheweiy@berkeley.edu>
Co-authored-by: Michael Wyatt <michaelwyatt@microsoft.com>
---
 .pre-commit-config.yaml                 |  2 +-
 README.md                               |  4 ++++
 blogs/deepspeed-chat/README.md          | 11 +++++++++++
 blogs/deepspeed-chat/chinese/README.md  | 11 +++++++++++
 blogs/deepspeed-chat/japanese/README.md | 11 +++++++++++
 docs/index.md                           |  4 ++++
 6 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8a6af07d12b3..36fa34a42744 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -58,7 +58,7 @@ repos:
             # Do not check files that are automatically generated
             '--skip=docs/Gemfile.lock,tests/unit/gpt2-merges.txt,tests/unit/gpt2-vocab.json',
             '--ignore-regex=\\n',  # Do not count the 'n' in an escaped newline as part of a word
-            '--ignore-words-list=unsupport',  # Word used in error messages that need rewording
+            '--ignore-words-list=youn,unsupport',  # 'youn' is an author surname; 'unsupport' is used in error messages that need rewording
             --check-filenames,
             --check-hidden
         ]
diff --git a/README.md b/README.md
index 51a9845b30d5..353eccdbd2f0 100755
--- a/README.md
+++ b/README.md
@@ -226,6 +226,10 @@ Conduct](https://opensource.microsoft.com/codeofconduct/). For more information
 20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023.
 21. Siddharth Singh, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He, Abhinav Bhatele. (2023) A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training [arXiv:2303.06318](https://arxiv.org/abs/2303.06318) and will appear at ICS 2023.
 22. Guanhua Wang, Heyang Qin, Sam Ade Jacobs, Connor Holmes, Samyam Rajbhandari, Olatunji Ruwase, Feng Yan, Lei Yang, Yuxiong He. (2023) ZeRO++: Extremely Efficient Collective Communication for Giant Model Training [arXiv:2306.10209](https://arxiv.org/abs/2306.10209).
+23. Zhewei Yao, Xiaoxia Wu, Cheng Li, Stephen Youn, Yuxiong He. (2023) ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation [arXiv:2303.08302](https://arxiv.org/abs/2303.08302).
+24. Pareesa Ameneh Golnari, Zhewei Yao, Yuxiong He. (2023) Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important? [arXiv:2305.09847](https://arxiv.org/abs/2305.09847).
+25. Zhewei Yao, Reza Yazdani Aminabadi, Olatunji Ruwase, Samyam Rajbhandari, Xiaoxia Wu, Ammar Ahmad Awan, Jeff Rasley, Minjia Zhang, Conglong Li, Connor Holmes, Zhongzhu Zhou, Michael Wyatt, Molly Smith, Lev Kurilenko, Heyang Qin, Masahiro Tanaka, Shuai Che, Shuaiwen Leon Song, Yuxiong He. (2023) DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales [arXiv:2308.01320](https://arxiv.org/abs/2308.01320).
+26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782).
 
 
 # Videos
diff --git a/blogs/deepspeed-chat/README.md b/blogs/deepspeed-chat/README.md
index 4d6c6de2d722..5110eef28bd5 100644
--- a/blogs/deepspeed-chat/README.md
+++ b/blogs/deepspeed-chat/README.md
@@ -10,6 +10,17 @@
 
 </div>
 
+To cite DeepSpeed Chat, please use our [arXiv report](https://arxiv.org/abs/2308.01320):
+
+```
+@article{yao2023dschat,
+  title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+  author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+  journal={arXiv preprint arXiv:2308.01320},
+  year={2023}
+}
+```
+
 # 1. Overview
 
 ChatGPT like models have taken the AI world by storm, and it would not be an overstatement to say that it's impact on the digital world has been revolutionary. These models are incredibly versatile, capable of performing tasks like summarization, coding, and translation with results that are on-par or even exceeding the capabilities of human experts. Given the sheer power of these models, multiple efforts are underway in the AI open-source community to make ChatGPT-style models more accessible (e.g. ChatLLaMa, Alpaca, Vicuna, Databricks-Dolly, etc.).
diff --git a/blogs/deepspeed-chat/chinese/README.md b/blogs/deepspeed-chat/chinese/README.md
index 66fcf3f4a754..03bf9b69449f 100644
--- a/blogs/deepspeed-chat/chinese/README.md
+++ b/blogs/deepspeed-chat/chinese/README.md
@@ -10,6 +10,17 @@
 
 </div>
 
+如需引用 DeepSpeed Chat，请引用我们的[arxiv report](https://arxiv.org/abs/2308.01320):
+
+```
+@article{yao2023dschat,
+  title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+  author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+  journal={arXiv preprint arXiv:2308.01320},
+  year={2023}
+}
+```
+
 # 1. 概述
 
 近日来，ChatGPT及类似模型引发了人工智能（AI）领域的一场风潮。 这场风潮对数字世界产生了革命性影响。ChatGPT类模型具有惊人的泛用性，能够执行归纳、编程、翻译等任务，其结果与人类专家相当甚至更优。为了使ChatGPT等模型的训练和部署更轻松，AI 开源社区进行了各种尝试（例如 ChatLLaMa、Alpaca、Vicuna、Databricks-Dolly等）。
diff --git a/blogs/deepspeed-chat/japanese/README.md b/blogs/deepspeed-chat/japanese/README.md
index 9c7256b8f506..e7aa62721417 100644
--- a/blogs/deepspeed-chat/japanese/README.md
+++ b/blogs/deepspeed-chat/japanese/README.md
@@ -10,6 +10,17 @@
 
 </div>
 
+DeepSpeed Chat を引用するには、こちらの[arxiv report](https://arxiv.org/abs/2308.01320)を引用してください:
+
+```
+@article{yao2023dschat,
+  title={{DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales}},
+  author={Zhewei Yao and Reza Yazdani Aminabadi and Olatunji Ruwase and Samyam Rajbhandari and Xiaoxia Wu and Ammar Ahmad Awan and Jeff Rasley and Minjia Zhang and Conglong Li and Connor Holmes and Zhongzhu Zhou and Michael Wyatt and Molly Smith and Lev Kurilenko and Heyang Qin and Masahiro Tanaka and Shuai Che and Shuaiwen Leon Song and Yuxiong He},
+  journal={arXiv preprint arXiv:2308.01320},
+  year={2023}
+}
+```
+
 # 1. 概要
 
 ChatGPT（チャットGPT）やその類似モデルは、AIの世界に旋風を巻き起こし、デジタル業界に革命的な影響を与えています。これらのモデルは非常に汎用性が高く、要約、コーディング、翻訳などの多様なタスクを、人間の専門家と同等か、それ以上の結果で実施できます。その圧倒的な性能を受けて、AI関連のオープンソースコミュニティでは、ChatGPTスタイルのモデルをより利用しやすくするための複数の取り組みが始まっています（ChatLLaMa、Alpaca、Vicuna、Databricks-Dollyなど）。
diff --git a/docs/index.md b/docs/index.md
index 8801ee3ca91b..aeef4df41e53 100755
--- a/docs/index.md
+++ b/docs/index.md
@@ -128,6 +128,10 @@ comments.
 20. Quentin Anthony, Ammar Ahmad Awan, Jeff Rasley, Yuxiong He, Aamir Shafi, Mustafa Abduljabbar, Hari Subramoni, Dhabaleswar Panda. (2023) MCR-DL: Mix-and-Match Communication Runtime for Deep Learning [arXiv:2303.08374](https://arxiv.org/abs/2303.08374) and will appear at IPDPS 2023.
 21. Siddharth Singh, Olatunji Ruwase, Ammar Ahmad Awan, Samyam Rajbhandari, Yuxiong He, Abhinav Bhatele. (2023) A Hybrid Tensor-Expert-Data Parallelism Approach to Optimize Mixture-of-Experts Training [arXiv:2303.06318](https://arxiv.org/abs/2303.06318) and will appear at ICS 2023.
 22. Guanhua Wang, Heyang Qin, Sam Ade Jacobs, Connor Holmes, Samyam Rajbhandari, Olatunji Ruwase, Feng Yan, Lei Yang, Yuxiong He. (2023) ZeRO++: Extremely Efficient Collective Communication for Giant Model Training [arXiv:2306.10209](https://arxiv.org/abs/2306.10209).
+23. Zhewei Yao, Xiaoxia Wu, Cheng Li, Stephen Youn, Yuxiong He. (2023) ZeroQuant-V2: Exploring Post-training Quantization in LLMs from Comprehensive Study to Low Rank Compensation [arXiv:2303.08302](https://arxiv.org/abs/2303.08302)
+24. Pareesa Ameneh Golnari, Zhewei Yao, Yuxiong He. (2023) Selective Guidance: Are All the Denoising Steps of Guided Diffusion Important? [arXiv:2305.09847](https://arxiv.org/abs/2305.09847)
+25. Zhewei Yao, Reza Yazdani Aminabadi, Olatunji Ruwase, Samyam Rajbhandari, Xiaoxia Wu, Ammar Ahmad Awan, Jeff Rasley, Minjia Zhang, Conglong Li, Connor Holmes, Zhongzhu Zhou, Michael Wyatt, Molly Smith, Lev Kurilenko, Heyang Qin, Masahiro Tanaka, Shuai Che, Shuaiwen Leon Song, Yuxiong He. (2023) DeepSpeed-Chat: Easy, Fast and Affordable RLHF Training of ChatGPT-like Models at All Scales [arXiv:2308.01320](https://arxiv.org/abs/2308.01320).
+26. Xiaoxia Wu, Zhewei Yao, Yuxiong He. (2023) ZeroQuant-FP: A Leap Forward in LLMs Post-Training W4A8 Quantization Using Floating-Point Formats [arXiv:2307.09782](https://arxiv.org/abs/2307.09782)
 
 # Videos
 1. DeepSpeed KDD 2020 Tutorial

From aaf309e6412b5bb107786a1aae1bc8d75a680233 Mon Sep 17 00:00:00 2001
From: purk <hughpu@hotmail.com>
Date: Wed, 9 Aug 2023 18:07:48 +0800
Subject: [PATCH 10/13] style: change flake8 detected style missmatch

---
 .../activation_checkpointing/checkpointing.py | 31 ++++----
 ..._activation_checkpointing_non_reentrant.py | 72 +++++++++++++++++--
 2 files changed, 79 insertions(+), 24 deletions(-)

diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index f4990e5c592d..9076b8d9d14e 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -712,6 +712,7 @@ def non_reentrant_checkpoint(function, *args):
     deepspeed_saved_tensors = None
     non_tensor_args = None
     tensor_flags = None
+
     def save_args_for_backward(*all_args):
         nonlocal deepspeed_saved_tensors, non_tensor_args, tensor_flags
         tensor_args, non_tensor_args, tensor_flags = extract_tensors(all_objects=all_args)
@@ -784,7 +785,7 @@ def save_args_for_backward(*all_args):
         save_args_for_backward(*new_args)
     else:
         save_args_for_backward(*args)
-    
+
     class Holder():
         pass
 
@@ -799,7 +800,7 @@ def checkpoint_pack(tensor_from_forward):
         if tensor_from_forward.requires_grad and tensor_from_forward.is_leaf:
             leaf_tensors.append(tensor_from_forward)
         return res
-    
+
     def checkpoint_unpack(holder_from_backward):
         nonlocal deepspeed_saved_tensors, non_tensor_args, tensor_flags
         if len(storage) == 0:
@@ -808,18 +809,18 @@ def checkpoint_unpack(holder_from_backward):
             def replay_pack(tensor_from_replay):
                 nonlocal unpack_counter
                 unpack_counter += 1
-                
+
                 if weak_holder_list[unpack_counter - 1]() is None:
                     return
-                
+
                 detached_activations = tensor_from_replay.detach()
                 storage[weak_holder_list[unpack_counter - 1]()] = detached_activations
-                
+
                 return
-            
+
             def replay_unpack(none_value):
                 raise RuntimeError("You are calling backwards on a tensor that is never exposed.")
-            
+
             global timers
             see_memory_usage("In backward", force=False)
             # removing pointers to the contiguous buffer memory
@@ -887,24 +888,21 @@ def replay_unpack(none_value):
             torch.set_rng_state(bwd_cpu_rng_state)
             _set_cuda_rng_state(bwd_cuda_rng_state)
             get_cuda_rng_tracker().set_states(bwd_cuda_rng_state_tracker)
-            
+
             deepspeed_saved_tensors = None
             non_tensor_args = None
             tensor_flags = None
 
-
         if holder_from_backward not in storage:
-            raise RuntimeError(
-                "Attempt to retrieve a tensor saved by autograd multiple times without checkpoint"
-                " recomputation being triggered in between, this is not currently supported."
-            )
-        
+            raise RuntimeError("Attempt to retrieve a tensor saved by autograd multiple times without checkpoint"
+                               " recomputation being triggered in between, this is not currently supported.")
+
         return storage[holder_from_backward]
 
     def after_backward_hook(_nonuse_grads):
         nonlocal leaf_tensors, backward_visited_leaf_nodes
         backward_visited_leaf_nodes += 1
-        
+
         if backward_visited_leaf_nodes == len(leaf_tensors):
             see_memory_usage("After backward checkpointing code after backward", force=False)
 
@@ -913,7 +911,6 @@ def after_backward_hook(_nonuse_grads):
                 timers.log(['backward'])
             if SYNCHRONIZE:
                 get_accelerator().synchronize()
-    
 
     with torch.autograd.graph.saved_tensors_hooks(checkpoint_pack, checkpoint_unpack):
         outputs = function(*inputs_cuda)
@@ -933,7 +930,7 @@ def after_backward_hook(_nonuse_grads):
         all_outputs += [outputs]
     else:
         all_outputs += outputs
-    
+
     if len(all_outputs) == 1:
         return all_outputs[0]
     else:
diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
index 378ae9d4b15c..e516a4619ed2 100644
--- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
+++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
@@ -6,14 +6,72 @@
 # TODO: add tests with model parallelism for activation partitioning and other features.
 
 from deepspeed.runtime.activation_checkpointing.checkpointing import non_reentrant_checkpoint
-import test_activation_checkpointing
 from test_activation_checkpointing import *
-from test_activation_checkpointing import (
-    _bool_to_float, _compute, _match_outputs, _mixed_mask,
-    _prep_inputs, _test_activation_checkpoint, _test_activation_checkpoint_ordering
-)
+from test_activation_checkpointing import (_mixed_mask, _bool_to_float, _prep_inputs, _match_outputs)
+
+
+def _compute(module, *inputs, do_checkpoint=False):
+    if do_checkpoint:
+        outputs = non_reentrant_checkpoint(module, *inputs)
+    else:
+        outputs = module(*inputs)
+
+    if torch.is_tensor(outputs):
+        outputs = (outputs, )
+
+    sum(o.sum() for o in outputs if torch.is_tensor(o) and o.requires_grad).backward()
+
+    grads = [p.grad for p in module.parameters()]
+    input_grads = [inp.grad for inp in inputs if torch.is_tensor(inp)]
+
+    return {
+        'outputs': outputs,
+        'module_grads': grads,
+        'input_grads': input_grads,
+    }
+
+
+def _test_activation_checkpoint(module, *inputs):
+    # Move to device
+    module.to(get_accelerator().device_name())
+
+    # Get rid of dropouts until we fork the RNG between tests.
+    module.eval()
+
+    module_ = deepcopy(module)
+    inputs_ = _prep_inputs(*inputs)
+    base = _compute(module_, *inputs_, do_checkpoint=False)
+
+    module_ = deepcopy(module)
+    inputs_ = _prep_inputs(*inputs)
+    test = _compute(module_, *inputs_, do_checkpoint=True)
+
+    for group in base.keys():
+        for b, t in zip(base[group], test[group]):
+            _match_outputs(b, t)
+
+
+def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs):
+    # Move to device
+    module.to(get_accelerator().device_name())
+
+    # Get rid of dropouts until we fork the RNG between tests.
+    module.eval()
+
+    module_ = deepcopy(module)
+    inputs_ = _prep_inputs(*inputs)
+    test = _compute(module_, *inputs_, do_checkpoint=True)
+
+    outputs = test['outputs']
+    test_ordering = []
+    for item in outputs:
+        if type(item) in [list, tuple]:
+            test_ordering += [torch.is_tensor(t) for t in item]
+        else:
+            test_ordering += [torch.is_tensor(item)]
+
+    assert expected_ordering == test_ordering
 
-ckpt = non_reentrant_checkpoint
 
 # both bool and float are important, as bool is not differentiable
 @pytest.mark.parametrize('mask', [
@@ -80,4 +138,4 @@ def test_ckpt_non_tensor_output_ordering(self, non_tensor_output):
             ordering += [torch.is_tensor(t) for t in non_tensor_output]
         else:
             ordering += [torch.is_tensor(non_tensor_output)]
-        _test_activation_checkpoint_ordering(module, ordering, inputs)
\ No newline at end of file
+        _test_activation_checkpoint_ordering(module, ordering, inputs)

From a9109221ca748437b5d194456a4e3c08198c08e9 Mon Sep 17 00:00:00 2001
From: purk <hughpu@hotmail.com>
Date: Wed, 9 Aug 2023 22:06:07 +0800
Subject: [PATCH 11/13] test: hack to clone the `test_activation_checkpointing`
 module for reuse and add regression tests

---
 ..._activation_checkpointing_non_reentrant.py | 83 +++++++------------
 1 file changed, 28 insertions(+), 55 deletions(-)

diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
index e516a4619ed2..226170872ece 100644
--- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
+++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
@@ -5,72 +5,45 @@
 
 # TODO: add tests with model parallelism for activation partitioning and other features.
 
-from deepspeed.runtime.activation_checkpointing.checkpointing import non_reentrant_checkpoint
-from test_activation_checkpointing import *
-from test_activation_checkpointing import (_mixed_mask, _bool_to_float, _prep_inputs, _match_outputs)
-
-
-def _compute(module, *inputs, do_checkpoint=False):
-    if do_checkpoint:
-        outputs = non_reentrant_checkpoint(module, *inputs)
-    else:
-        outputs = module(*inputs)
-
-    if torch.is_tensor(outputs):
-        outputs = (outputs, )
-
-    sum(o.sum() for o in outputs if torch.is_tensor(o) and o.requires_grad).backward()
+import sys
+import torch
+import pytest
+from importlib import util
 
-    grads = [p.grad for p in module.parameters()]
-    input_grads = [inp.grad for inp in inputs if torch.is_tensor(inp)]
-
-    return {
-        'outputs': outputs,
-        'module_grads': grads,
-        'input_grads': input_grads,
-    }
-
-
-def _test_activation_checkpoint(module, *inputs):
-    # Move to device
-    module.to(get_accelerator().device_name())
+from deepspeed.runtime.activation_checkpointing.checkpointing import non_reentrant_checkpoint
+from unit.common import DistributedTest
 
-    # Get rid of dropouts until we fork the RNG between tests.
-    module.eval()
+ORG_SPEC = util.find_spec('test_activation_checkpointing')
+test_act_ckpt = util.module_from_spec(ORG_SPEC)
+ORG_SPEC.loader.exec_module(test_act_ckpt)
+sys.modules['test_act_ckpt'] = test_act_ckpt
+test_act_ckpt.ckpt = non_reentrant_checkpoint
 
-    module_ = deepcopy(module)
-    inputs_ = _prep_inputs(*inputs)
-    base = _compute(module_, *inputs_, do_checkpoint=False)
+HIDDEN_DIM = test_act_ckpt.HIDDEN_DIM
 
-    module_ = deepcopy(module)
-    inputs_ = _prep_inputs(*inputs)
-    test = _compute(module_, *inputs_, do_checkpoint=True)
+MaskedLinear = test_act_ckpt.MaskedLinear
+MaskedLinearSeq = test_act_ckpt.MaskedLinearSeq
+MaskedLinearSeqDup = test_act_ckpt.MaskedLinearSeqDup
+DropMaskLinear = test_act_ckpt.DropMaskLinear
+LinearNonTensorInput = test_act_ckpt.LinearNonTensorInput
+LinearNonTensorOutput = test_act_ckpt.LinearNonTensorOutput
 
-    for group in base.keys():
-        for b, t in zip(base[group], test[group]):
-            _match_outputs(b, t)
+_test_activation_checkpoint = test_act_ckpt._test_activation_checkpoint
+_mixed_mask = test_act_ckpt._mixed_mask
+_bool_to_float = test_act_ckpt._bool_to_float
+_test_activation_checkpoint_ordering = test_act_ckpt._test_activation_checkpoint_ordering
 
 
-def _test_activation_checkpoint_ordering(module, expected_ordering, *inputs):
-    # Move to device
-    module.to(get_accelerator().device_name())
+class TestActivationCheckpointWithGrad(test_act_ckpt.TestActivationCheckpoint):
+    pass
 
-    # Get rid of dropouts until we fork the RNG between tests.
-    module.eval()
 
-    module_ = deepcopy(module)
-    inputs_ = _prep_inputs(*inputs)
-    test = _compute(module_, *inputs_, do_checkpoint=True)
+class TestCheckpointNonTensorWithGrad(test_act_ckpt.TestCheckpointNonTensor):
+    pass
 
-    outputs = test['outputs']
-    test_ordering = []
-    for item in outputs:
-        if type(item) in [list, tuple]:
-            test_ordering += [torch.is_tensor(t) for t in item]
-        else:
-            test_ordering += [torch.is_tensor(item)]
 
-    assert expected_ordering == test_ordering
+class TestCheckpointNonTensorOutputOrderingWithGrad(test_act_ckpt.TestCheckpointNonTensorOutputOrdering):
+    pass
 
 
 # both bool and float are important, as bool is not differentiable

From fc919b17bd0ca4825c41a251a55088414c2fb90f Mon Sep 17 00:00:00 2001
From: purk <hughpu@hotmail.com>
Date: Wed, 9 Aug 2023 23:13:48 +0800
Subject: [PATCH 12/13] doc: explain the introduction of
 `non_reentrant_checkpoint`

---
 .../activation_checkpointing/checkpointing.py | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/deepspeed/runtime/activation_checkpointing/checkpointing.py b/deepspeed/runtime/activation_checkpointing/checkpointing.py
index 9076b8d9d14e..77407a52026a 100644
--- a/deepspeed/runtime/activation_checkpointing/checkpointing.py
+++ b/deepspeed/runtime/activation_checkpointing/checkpointing.py
@@ -707,6 +707,19 @@ def backward(ctx, *grads):
 
 
 def non_reentrant_checkpoint(function, *args):
+    """This function is union of `torch.utils.checkpoint._checkpoint_without_reentrant` and `CheckpointFunction` in this module
+
+    This function aims to solve the back propagation error raised when none of the inputs requires grad.
+    * already implemented in pytorch for a while; the solution is stable most of the time, except in jit module mode.
+    * can help to solve the issue which is hacked by `deepspeed.runtime.pipe.module.PipelineModule._is_checkpointable`
+
+    Main modifications compared to the implementation of torch:
+    1. adapt to the signature of `checkpoint` function in this module
+    2. solve the non-determinism by random state management consistent with deepspeed `CheckpointFunction`
+    3. when there is partition or cpu checkpointing, gather them in the unpack_hook during back propagation
+    4. move all after-backward blocks into a hook which will be executed after all leaf nodes' backward execution.
+    5. above 4. is inspired by `torch.autograd.graph.register_multi_grad_hook`, which is only implemented after 2.0.0
+    """
     global mpu, timers, SYNCHRONIZE, PROFILE_TIME
 
     deepspeed_saved_tensors = None
@@ -714,6 +727,7 @@ def non_reentrant_checkpoint(function, *args):
     tensor_flags = None
 
     def save_args_for_backward(*all_args):
+        """keep this function to reduce the modification from original implementation"""
         nonlocal deepspeed_saved_tensors, non_tensor_args, tensor_flags
         tensor_args, non_tensor_args, tensor_flags = extract_tensors(all_objects=all_args)
         deepspeed_saved_tensors = tensor_args
@@ -787,26 +801,43 @@ def save_args_for_backward(*all_args):
         save_args_for_backward(*args)
 
     class Holder():
+        """the place holder object used as activations to save memory"""
         pass
 
+    # weakref seems to be used to detect tensor deletion before a whole
+    # forward-backward pair loop has finished
     storage: weakref.WeakKeyDictionary = weakref.WeakKeyDictionary()
     weak_holder_list = []
     leaf_tensors = []
     backward_visited_leaf_nodes = 0
 
     def checkpoint_pack(tensor_from_forward):
+        """used to record the activation order in the `weak_holder_list`
+
+        the activation order in holder list is consistent between the first forward and recomputing forward.
+        * the jit compiled forward will break the order consistency *
+        """
         res = Holder()
         weak_holder_list.append(weakref.ref(res))
+
+        # if this is a leaf tensor, save it to trace the backward progress;
+        # leaf tensors are usually inputs or parameters, which are not activations and
+        # have no memory overhead
         if tensor_from_forward.requires_grad and tensor_from_forward.is_leaf:
             leaf_tensors.append(tensor_from_forward)
         return res
 
     def checkpoint_unpack(holder_from_backward):
+        """retrieve the activations from recompute"""
         nonlocal deepspeed_saved_tensors, non_tensor_args, tensor_flags
+
+        # if this is the first step of backward propagation, recompute the graph and save
+        # all the activations in the same order as `checkpoint_pack` does
         if len(storage) == 0:
             unpack_counter = 0
 
             def replay_pack(tensor_from_replay):
+                """save recompute activations"""
                 nonlocal unpack_counter
                 unpack_counter += 1
 
@@ -819,6 +850,7 @@ def replay_pack(tensor_from_replay):
                 return
 
             def replay_unpack(none_value):
+                """the recomputed graph does not need to run backward"""
                 raise RuntimeError("You are calling backwards on a tensor that is never exposed.")
 
             global timers
@@ -852,6 +884,7 @@ def replay_unpack(none_value):
 
             global cuda_device, transport_stream, PARTITION_ACTIVATIONS
 
+            # gather inputs which were partitioned or checkpointed before the first forward
             if PARTITION_ACTIVATIONS:
                 # with get_accelerator().stream(transport_stream):
                 inputs = gather_partitioned_activations(deepspeed_saved_tensors,
@@ -900,6 +933,7 @@ def replay_unpack(none_value):
         return storage[holder_from_backward]
 
     def after_backward_hook(_nonuse_grads):
+        """the hook registered to all leaf tensors"""
         nonlocal leaf_tensors, backward_visited_leaf_nodes
         backward_visited_leaf_nodes += 1
 

From b6a0a44beb39174a2c620636b654f3e81e99ee7f Mon Sep 17 00:00:00 2001
From: purk <hughpu@hotmail.com>
Date: Wed, 9 Aug 2023 23:22:35 +0800
Subject: [PATCH 13/13] doc: explain the test of `non_reentrant_checkpoint`

---
 .../test_activation_checkpointing_non_reentrant.py     | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
index 226170872ece..06e40655e75d 100644
--- a/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
+++ b/tests/unit/runtime/activation_checkpointing/test_activation_checkpointing_non_reentrant.py
@@ -13,6 +13,8 @@
 from deepspeed.runtime.activation_checkpointing.checkpointing import non_reentrant_checkpoint
 from unit.common import DistributedTest
 
+# the hack to clone the module `test_activation_checkpointing` and inject
+# `non_reentrant_checkpoint` as the `ckpt` of the origin test module
 ORG_SPEC = util.find_spec('test_activation_checkpointing')
 test_act_ckpt = util.module_from_spec(ORG_SPEC)
 ORG_SPEC.loader.exec_module(test_act_ckpt)
@@ -35,23 +37,27 @@
 
 
 class TestActivationCheckpointWithGrad(test_act_ckpt.TestActivationCheckpoint):
+    """test `non_reentrant_checkpoint` can still checkpoint activations for inputs with grad"""
     pass
 
 
 class TestCheckpointNonTensorWithGrad(test_act_ckpt.TestCheckpointNonTensor):
+    """test `non_reentrant_checkpoint` can still checkpoint activations for inputs with grad"""
     pass
 
 
 class TestCheckpointNonTensorOutputOrderingWithGrad(test_act_ckpt.TestCheckpointNonTensorOutputOrdering):
+    """test `non_reentrant_checkpoint` can still checkpoint activations for inputs with grad"""
     pass
 
 
-# both bool and float are important, as bool is not differentiable
+# the classes below test graphs whose inputs have no grad while the parameters have grad, namely a partial graph?
 @pytest.mark.parametrize('mask', [
     _mixed_mask(),
     _bool_to_float(_mixed_mask()),
 ])
 class TestActivationCheckpointWithoutGrad(DistributedTest):
+    """test all input tensors without grad"""
     world_size = 1
 
     def test_ckpt_inputs1_outputs1(self, mask):
@@ -82,6 +88,7 @@ def test_ckpt_arg_none(self, mask):
 
 @pytest.mark.parametrize('non_tensor', [None, 2, True, (None, 2.5), (None, True, torch.randn(HIDDEN_DIM))])
 class TestCheckpointNonTensorWithoutGrad(DistributedTest):
+    """test all input tensors without grad"""
     world_size = 1
 
     def test_ckpt_non_tensor_input(self, non_tensor):
@@ -99,6 +106,7 @@ def test_ckpt_non_tensor_output(self, non_tensor):
     None, (torch.randn(HIDDEN_DIM), 2.5), (None, torch.randn(HIDDEN_DIM), True), (None, True, torch.randn(HIDDEN_DIM))
 ])
 class TestCheckpointNonTensorOutputOrderingWithoutGrad(DistributedTest):
+    """test all input tensors without grad"""
     world_size = 1
 
     def test_ckpt_non_tensor_output_ordering(self, non_tensor_output):