From d2a526331287bec93979c2440f8ed1bfbebd97f8 Mon Sep 17 00:00:00 2001 From: Tixxx Date: Tue, 28 Jul 2020 05:19:09 +0000 Subject: [PATCH 1/4] added reducesumlogexp gradient added test fixed type mismatch when calling cudnnreduce kernel fixed python frontend to remove redundant states to match pytorch state dict --- .../providers/cuda/reduction/reduction_ops.cc | 9 ++- .../core/graph/gradient_builder.cc | 29 ++++++++ .../orttraining/core/graph/gradient_builder.h | 1 + .../core/graph/gradient_builder_registry.cc | 1 + orttraining/orttraining/python/ort_trainer.py | 8 ++- .../test/gradient/gradient_ops_test.cc | 68 +++++++++++++++++++ 6 files changed, 113 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 4723246fcff97..28245d9743823 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -396,8 +396,9 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } CudnnReduceDescriptor reduce_desc; - if (std::is_same<T, MLFloat16>::value) + if (std::is_same<T, MLFloat16>::value) { ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices)); + } else ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices)); const auto one = Consts<CudaT>::One; @@ -438,7 +439,11 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } else { // Reduce max -- Max/Min will output indices data CudnnReduceDescriptor reduce_max_desc; - ORT_RETURN_IF_ERROR(reduce_max_desc.Set(CUDNN_REDUCE_TENSOR_MAX, cudnn_type_X, CUDNN_REDUCE_TENSOR_NO_INDICES)); + cudnnDataType_t cudnn_reduce_max_type = cudnn_type_X; + if((std::is_same<T, MLFloat16>::value)) { + cudnn_reduce_max_type = CUDNN_DATA_FLOAT; + } + ORT_RETURN_IF_ERROR(reduce_max_desc.Set(CUDNN_REDUCE_TENSOR_MAX, cudnn_reduce_max_type, CUDNN_REDUCE_TENSOR_NO_INDICES)); size_t indices_bytes_max = 0; CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(cuda_ep.PerThreadCudnnHandle(), reduce_max_desc, input_tensor, output_tensor, &indices_bytes_max)); diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index a823d46ba94fc..cd3073f1c7b1f 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -788,6 +788,35 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) { return result; } +IMPLEMENT_GRADIENT_BUILDER(GetReduceLogSumExpGradient) { + std::vector<NodeDef> result; + auto attributes = SrcNodeAttributes(); + bool keepdims = true; + if (attributes.find("keepdims") != attributes.end() && + attributes.at("keepdims").has_i()) { + keepdims = static_cast<bool>(attributes.at("keepdims").i()); + } + + ArgDef grad = GO(0); + if (!keepdims && attributes.find("axes") != attributes.end()) { + std::vector<int64_t> axes_values = RetrieveValues<int64_t>(attributes.at("axes")); + grad = IA("Unsqueezed_Grad"); + result.push_back(NodeDef("Unsqueeze", {GO(0)}, {grad}, {MakeAttribute("axes", axes_values)})); + + result.push_back(NodeDef("Unsqueeze", {O(0)}, {IA("Unsqueezed_Output")}, {MakeAttribute("axes", axes_values)})); + result.push_back(NodeDef("Sub", {I(0), IA("Unsqueezed_Output")}, {IA("Self_Sub_Result")})); + } + else { + result.push_back(NodeDef("Sub", {I(0), O(0)}, {IA("Self_Sub_Result")})); + } + + result.push_back(NodeDef("Exp", {IA("Self_Sub_Result")}, {IA("Self_Sub_Result_Exp")})); + +
result.push_back(NodeDef("Mul", {IA("Self_Sub_Result_Exp"), grad}, {GI(0)})); + + return result; +} + IMPLEMENT_GRADIENT_BUILDER(GetReduceSumGradient) { std::vector<NodeDef> result; auto attributes = SrcNodeAttributes(); diff --git a/orttraining/orttraining/core/graph/gradient_builder.h b/orttraining/orttraining/core/graph/gradient_builder.h index 9a32e421bf5b3..819c800820ed0 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.h +++ b/orttraining/orttraining/core/graph/gradient_builder.h @@ -25,6 +25,7 @@ DECLARE_GRADIENT_BUILDER(GetMulGradient) DECLARE_GRADIENT_BUILDER(GetDivGradient) DECLARE_GRADIENT_BUILDER(GetReduceMeanGradient) DECLARE_GRADIENT_BUILDER(GetReduceSumGradient) +DECLARE_GRADIENT_BUILDER(GetReduceLogSumExpGradient) DECLARE_GRADIENT_BUILDER(GetPowGradient) DECLARE_GRADIENT_BUILDER(GetConcatGradient) DECLARE_GRADIENT_BUILDER(GetReshapeGradient) diff --git a/orttraining/orttraining/core/graph/gradient_builder_registry.cc b/orttraining/orttraining/core/graph/gradient_builder_registry.cc index 94b4e1e096992..7631ce25eb311 100644 --- a/orttraining/orttraining/core/graph/gradient_builder_registry.cc +++ b/orttraining/orttraining/core/graph/gradient_builder_registry.cc @@ -51,6 +51,7 @@ void GradientBuilderRegistry::RegisterGradientBuilders() { REGISTER_GRADIENT_BUILDER("Pow", GetPowGradient); REGISTER_GRADIENT_BUILDER("ReduceMean", GetReduceMeanGradient); REGISTER_GRADIENT_BUILDER("ReduceSum", GetReduceSumGradient); + REGISTER_GRADIENT_BUILDER("ReduceLogSumExp", GetReduceLogSumExpGradient); REGISTER_GRADIENT_BUILDER("Add", GetAddSubGradient); REGISTER_GRADIENT_BUILDER("Sub", GetAddSubGradient); REGISTER_GRADIENT_BUILDER("Mul", GetMulGradient); diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index a69c41e312a9d..cbc4a2e8a2e3a 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -773,7 +773,13 @@ def state_dict(self): if n.name not in torch_state: torch_state[n.name] = torch.from_numpy(numpy_helper.to_array(n)) - return torch_state + # Need to remove redundant initializers and name suffixes to map back to original torch state names + torch_state_to_return = {} + for name, value in torch_state.items(): + if not (("Moment" in name) or ("Update_Count" in name)): + name = name.replace('_fp16', '') + torch_state_to_return[name] = value + return torch_state_to_return def load_state_dict(self, state_dict, strict=False): # Note: It may happen ONNX model has not yet been initialized diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 9bc844d46cfe0..c3a14dbb484f0 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -546,6 +546,74 @@ TEST(GradientCheckerTest, ReduceSumGrad) { } } +TEST(GradientCheckerTest, ReduceLogSumExpGrad) { + float max_error; + GradientChecker<float, float, float> gradient_checker; + // Attribute axes supports negative values from opset 11.
+ OpDef op_def{"ReduceLogSumExp", kOnnxDomain, 11}; + + // default + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 1, 1}}, &max_error); + EXPECT_IS_TINY(max_error); + } + + // axes = [0, 1, 2], keepdims = 0 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [0, 2], keepdims = 1 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 3, 1}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{0, 2})}); + EXPECT_IS_TINY(max_error); + } + + // axes = [0, 1], keepdims = 0 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{2}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{0, 1}), + MakeAttribute("keepdims", int64_t(0))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [1], keepdims = 1 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{1}), + MakeAttribute("keepdims", int64_t(1))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [2], keepdims = 0 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 3}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{2}), + MakeAttribute("keepdims", int64_t(0))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [-2], keepdims = 1 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{-2}), + MakeAttribute("keepdims", int64_t(1))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [-1, -3], keepdims = 0 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{3}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{-1, -3}), + MakeAttribute("keepdims", int64_t(0))}); + EXPECT_IS_TINY(max_error); + } +} + #ifndef USE_CUDA TEST(GradientCheckerTest, CastGrad) { // A dummy test that cast float to float From 0d49d5dd25b5e2e1bc92b337166544bfe4f1087c Mon Sep 17 00:00:00 2001 From: Tixxx Date: Wed, 29 Jul 2020 04:57:50 +0000 Subject: [PATCH 2/4] PR comments --- .../providers/cuda/reduction/reduction_ops.cc | 5 +- .../core/graph/gradient_builder.cc | 5 + orttraining/orttraining/python/ort_trainer.py | 9 +- .../test/gradient/gradient_op_test_utils.h | 4 + .../test/gradient/gradient_ops_test.cc | 337 ++++++++---------- 5 files changed, 158 insertions(+), 202 deletions(-) diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 28245d9743823..b2cefca19651c 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -398,9 +398,10 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr CudnnReduceDescriptor reduce_desc; if (std::is_same<T, MLFloat16>::value) { ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices)); - } - else + } else { ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices)); + } + const auto one = Consts<CudaT>::One; const auto zero = Consts<CudaT>::Zero; CudnnTensor input_tensor; diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index cd3073f1c7b1f..674063aa62695 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -788,6 +788,11 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient)
{ return result; } +// Reference computation is pytorch's logsumexp_backward +// dx_i = exp(xi) / reduceSum(exp(xi)) +// O(0) = log(reduceSum(exp(xi))) +// Self_Sub_Result = I(0) - O(0) = xi - log(sum(exp(xi))) = log(exp(xi) / reduceSum(exp(xi))) +// Gradient computation is re-using output and input from forward op, can be a recomputation candidate. IMPLEMENT_GRADIENT_BUILDER(GetReduceLogSumExpGradient) { std::vector<NodeDef> result; auto attributes = SrcNodeAttributes(); diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index cbc4a2e8a2e3a..c45f04a4561ef 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -628,6 +628,8 @@ def __init__(self, model, loss_fn, model_desc, training_optimizer_name, map_opti self.world_rank = world_rank self.world_size = world_size self.use_mixed_precision = use_mixed_precision + + self.original_model_state_keys = list(model.state_dict().keys()) self.session = None self.device_ = device @@ -775,10 +777,9 @@ def state_dict(self): # Need to remove redundant initializers and name suffixes to map back to original torch state names torch_state_to_return = {} - for name, value in torch_state.items(): - if not (("Moment" in name) or ("Update_Count" in name)): - name = name.replace('_fp16', '') - torch_state_to_return[name] = value + for key in self.original_model_state_keys: + if key in torch_state: + torch_state_to_return[key] = torch_state[key] return torch_state_to_return def load_state_dict(self, state_dict, strict=False): diff --git a/orttraining/orttraining/test/gradient/gradient_op_test_utils.h b/orttraining/orttraining/test/gradient/gradient_op_test_utils.h index ad75d061627da..71ab36a3bc4fc 100644 --- a/orttraining/orttraining/test/gradient/gradient_op_test_utils.h +++ b/orttraining/orttraining/test/gradient/gradient_op_test_utils.h @@ -7,6 +7,9 @@ namespace onnxruntime { namespace test { +using input_x_vector = std::vector<std::vector<TensorInfo>>; +using input_y_vector = std::vector<std::vector<TensorInfo>>; +using attr_vector = std::vector<std::vector<AttributeProto>>; class GradientOpTester : public OpTester { public: @@ -39,3 +42,4 @@ class GradientOpTester : public OpTester { }; } // namespace test } // namespace onnxruntime + diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index c3a14dbb484f0..c1f405bb33bb0 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -37,6 +37,29 @@ static bool IsErrorWithinTolerance(float error, float tolerance) { #define EXPECT_IS_TINY(max_error) \ EXPECT_IS_TINIER_THAN(max_error, 1.5e-2f) +static void RunReductionTests(const OpDef& op_def, + const input_x_vector& input_x, + const input_y_vector& input_y, + const attr_vector& attr_vector) { + EXPECT_TRUE(input_x.size() == input_y.size()) + << "Input_x vector and input_y vector must contain same number of elements. " + << "Input_x size: " << input_x.size() + << "Input_y size:" << input_y.size(); + EXPECT_TRUE(input_x.size() == attr_vector.size()) + << "Input_x vector and attribute vector must contain same number of elements. 
" + << "Input_x size: " << input_x.size() + << "attr_vector size:" << attr_vector.size(); + + GradientChecker gradient_checker; + + float max_error; + + for (size_t i = 0; i < input_x.size(); i++) { + max_error = 0; + gradient_checker.ComputeGradientError(op_def, input_x[i], input_y[i], &max_error, attr_vector[i]); + EXPECT_IS_TINY(max_error); + } +} template void GenerateRandomDataWithOneHot( @@ -401,217 +424,138 @@ TEST(GradientCheckerTest, GemmGrad) { } TEST(GradientCheckerTest, ReduceMeanGrad) { - float max_error; - GradientChecker gradient_checker; // Attribute axes supports negative values from opset 11. OpDef op_def{"ReduceMean", kOnnxDomain, 11}; - // default - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 1, 1}}, &max_error); - EXPECT_IS_TINY(max_error); - } - - // TODO: Fix forward kernel behavior for default axes - // default axes, keepdims = 0 - /* - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, - {MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - */ - - // axes = [0, 1, 2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 3, 1}}, &max_error, - {MakeAttribute("axes", std::vector{0, 2})}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{2}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [1], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 3}}, &max_error, - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-2, -1], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4}}, &max_error, - {MakeAttribute("axes", std::vector{-2, -1}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } + input_x_vector input_x; + for (size_t i = 0; i < 8; i++) { + input_x.push_back({{4, 3, 2}}); + } + + input_y_vector input_y = {{{1, 1, 1}}, + {{}}, + {{1, 3, 1}}, + {{2}}, + {{4, 1, 2}}, + {{4, 3}}, + {{4, 1, 2}}, + {{4}}}; + + attr_vector attr_vector = { + // default + {}, + // axes = [0, 1, 2], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [0, 2], keepdims = 1 + {MakeAttribute("axes", std::vector{0, 2})}, + // axes = [0, 1], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [1], keepdims = 1 + {MakeAttribute("axes", std::vector{1}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [2], keepdims = 0 + {MakeAttribute("axes", std::vector{2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = 
[-2], keepdims = 1 + {MakeAttribute("axes", std::vector{-2}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [-2, -1], keepdims = 0 + {MakeAttribute("axes", std::vector{-2, -1}), + MakeAttribute("keepdims", int64_t(0))}}; + + RunReductionTests(op_def, input_x, input_y, attr_vector); } TEST(GradientCheckerTest, ReduceSumGrad) { - float max_error; - GradientChecker gradient_checker; // Attribute axes supports negative values from opset 11. OpDef op_def{"ReduceSum", kOnnxDomain, 11}; - // default - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 1, 1}}, &max_error); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1, 2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 3, 1}}, &max_error, - {MakeAttribute("axes", std::vector{0, 2})}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{2}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [1], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 3}}, &max_error, - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-1, -3], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{3}}, &max_error, - {MakeAttribute("axes", std::vector{-1, -3}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } + input_x_vector input_x; + for (size_t i = 0; i < 8; i++) { + input_x.push_back({{4, 3, 2}}); + } + + input_y_vector input_y = {{{1, 1, 1}}, + {{}}, + {{1, 3, 1}}, + {{2}}, + {{4, 1, 2}}, + {{4, 3}}, + {{4, 1, 2}}, + {{3}}}; + + attr_vector attr_vector = { + // default + {}, + // axes = [0, 1, 2], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [0, 2], keepdims = 1 + {MakeAttribute("axes", std::vector{0, 2})}, + // axes = [0, 1], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [1], keepdims = 1 + {MakeAttribute("axes", std::vector{1}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [2], keepdims = 0 + {MakeAttribute("axes", std::vector{2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [-2], keepdims = 1 + {MakeAttribute("axes", std::vector{-2}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [-1, -3], keepdims = 0 + {MakeAttribute("axes", std::vector{-1, -3}), + MakeAttribute("keepdims", int64_t(0))}}; + + RunReductionTests(op_def, input_x, input_y, attr_vector); } TEST(GradientCheckerTest, ReduceLogSumExpGrad) { - float max_error; - GradientChecker gradient_checker; // Attribute axes supports 
negative values from opset 11. OpDef op_def{"ReduceLogSumExp", kOnnxDomain, 11}; - // default - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 1, 1}}, &max_error); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1, 2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 3, 1}}, &max_error, - {MakeAttribute("axes", std::vector{0, 2})}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{2}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [1], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 3}}, &max_error, - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-1, -3], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{3}}, &max_error, - {MakeAttribute("axes", std::vector{-1, -3}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } + input_x_vector input_x; + for (size_t i = 0; i < 8; i++) { + input_x.push_back({{4, 3, 2}}); + } + + input_y_vector input_y = {{{1, 1, 1}}, + {{}}, + {{1, 3, 1}}, + {{2}}, + {{4, 1, 2}}, + {{4, 3}}, + {{4, 1, 2}}, + {{3}}}; + + attr_vector attr_vector = { + // default + {}, + // axes = [0, 1, 2], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [0, 2], keepdims = 1 + {MakeAttribute("axes", std::vector{0, 2})}, + // axes = [0, 1], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [1], keepdims = 1 + {MakeAttribute("axes", std::vector{1}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [2], keepdims = 0 + {MakeAttribute("axes", std::vector{2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [-2], keepdims = 1 + {MakeAttribute("axes", std::vector{-2}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [-1, -3], keepdims = 0 + {MakeAttribute("axes", std::vector{-1, -3}), + MakeAttribute("keepdims", int64_t(0))}}; + + RunReductionTests(op_def, input_x, input_y, attr_vector); } #ifndef USE_CUDA @@ -1998,3 +1942,4 @@ TEST(GradientCheckerTest, ExpandGrad) { } // namespace onnxruntime #endif // NDEBUG + From 3e9a72db386f3659fd7122c7408b0b47d593f76a Mon Sep 17 00:00:00 2001 From: Tixxx Date: Wed, 29 Jul 2020 16:44:09 +0000 Subject: [PATCH 3/4] fixed python frontend test failure --- orttraining/orttraining/python/ort_trainer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index 
c45f04a4561ef..4c1dba47c6261 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -628,8 +628,8 @@ def __init__(self, model, loss_fn, model_desc, training_optimizer_name, map_opti self.world_rank = world_rank self.world_size = world_size self.use_mixed_precision = use_mixed_precision - - self.original_model_state_keys = list(model.state_dict().keys()) + + self.original_model_state_keys = list(model.state_dict().keys()) if hasattr(model, 'state_dict') else [] self.session = None self.device_ = device @@ -777,9 +777,12 @@ def state_dict(self): # Need to remove redundant initializers and name suffixes to map back to original torch state names torch_state_to_return = {} - for key in self.original_model_state_keys: - if key in torch_state: - torch_state_to_return[key] = torch_state[key] + if self.original_model_state_keys: + for key in self.original_model_state_keys: + if key in torch_state: + torch_state_to_return[key] = torch_state[key] + else: + torch_state_to_return = torch_state return torch_state_to_return def load_state_dict(self, state_dict, strict=False): From 60c517c7fcd62afe3fdc7dbc4b10b023baebbba4 Mon Sep 17 00:00:00 2001 From: Tixxx Date: Wed, 29 Jul 2020 21:13:26 +0000 Subject: [PATCH 4/4] PR comments --- orttraining/orttraining/python/ort_trainer.py | 10 +- .../test/gradient/gradient_op_test_utils.h | 6 +- .../test/gradient/gradient_ops_test.cc | 189 ++++++------------ 3 files changed, 64 insertions(+), 141 deletions(-) diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index 4c1dba47c6261..b350e3c578c77 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -776,13 +776,9 @@ def state_dict(self): torch_state[n.name] = torch.from_numpy(numpy_helper.to_array(n)) # Need to remove redundant initializers and name suffixes to map back to original torch state names - torch_state_to_return = {} - if self.original_model_state_keys: - for key in self.original_model_state_keys: - if key in torch_state: - torch_state_to_return[key] = torch_state[key] - else: - torch_state_to_return = torch_state + torch_state_to_return = {key: torch_state[key] for key in self.original_model_state_keys if key in torch_state} \ + if self.original_model_state_keys \ + else torch_state return torch_state_to_return def load_state_dict(self, state_dict, strict=False): diff --git a/orttraining/orttraining/test/gradient/gradient_op_test_utils.h b/orttraining/orttraining/test/gradient/gradient_op_test_utils.h index 71ab36a3bc4fc..26f56ddbe0fbf 100644 --- a/orttraining/orttraining/test/gradient/gradient_op_test_utils.h +++ b/orttraining/orttraining/test/gradient/gradient_op_test_utils.h @@ -7,9 +7,9 @@ namespace onnxruntime { namespace test { -using input_x_vector = std::vector<std::vector<TensorInfo>>; -using input_y_vector = std::vector<std::vector<TensorInfo>>; -using attr_vector = std::vector<std::vector<AttributeProto>>; +using TestDataVector = std::tuple<std::vector<std::vector<TensorInfo>>, // Input data + std::vector<std::vector<TensorInfo>>, // output data + std::vector<std::vector<AttributeProto>>>; //attribute class GradientOpTester : public OpTester { public: diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index c1f405bb33bb0..fe9b1ee0e92bd 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -37,6 +37,26 @@ static bool IsErrorWithinTolerance(float error, float tolerance) { #define EXPECT_IS_TINY(max_error) \
EXPECT_IS_TINIER_THAN(max_error, 1.5e-2f) -static void RunReductionTests(const OpDef& op_def, - const input_x_vector& input_x, - const input_y_vector& input_y, - const attr_vector& attr_vector) { - EXPECT_TRUE(input_x.size() == input_y.size()) - << "Input_x vector and input_y vector must contain same number of elements. " - << "Input_x size: " << input_x.size() - << "Input_y size:" << input_y.size(); - EXPECT_TRUE(input_x.size() == attr_vector.size()) - << "Input_x vector and attribute vector must contain same number of elements. " - << "Input_x size: " << input_x.size() - << "attr_vector size:" << attr_vector.size(); + +static void RunReductionTests(const OpDef& op_def) { + + TestDataVector test_data( + // Input X + { + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + }, + // Input Y + { + {{1, 1, 1}}, + {{}}, + {{1, 3, 1}}, + {{2}}, + {{4, 1, 2}}, + {{4, 3}}, + {{4, 1, 2}}, + {{4}} + }, + // Attributes + { + // default + {}, + // axes = [0, 1, 2], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [0, 2], keepdims = 1 + {MakeAttribute("axes", std::vector{0, 2})}, + // axes = [0, 1], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [1], keepdims = 1 + {MakeAttribute("axes", std::vector{1}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [2], keepdims = 0 + {MakeAttribute("axes", std::vector{2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [-2], keepdims = 1 + {MakeAttribute("axes", std::vector{-2}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [-2, -1], keepdims = 0 + {MakeAttribute("axes", std::vector{-2, -1}), + MakeAttribute("keepdims", int64_t(0))} + }); GradientChecker gradient_checker; float max_error; - for (size_t i = 0; i < input_x.size(); i++) { + for (size_t i = 0; i < std::get<0>(test_data).size(); i++) { max_error = 0; - gradient_checker.ComputeGradientError(op_def, input_x[i], input_y[i], &max_error, attr_vector[i]); + gradient_checker.ComputeGradientError(op_def, std::get<0>(test_data)[i], + std::get<1>(test_data)[i], &max_error, + std::get<2>(test_data)[i]); EXPECT_IS_TINY(max_error); } } @@ -427,135 +468,21 @@ TEST(GradientCheckerTest, ReduceMeanGrad) { // Attribute axes supports negative values from opset 11. 
OpDef op_def{"ReduceMean", kOnnxDomain, 11}; - input_x_vector input_x; - for (size_t i = 0; i < 8; i++) { - input_x.push_back({{4, 3, 2}}); - } - - input_y_vector input_y = {{{1, 1, 1}}, - {{}}, - {{1, 3, 1}}, - {{2}}, - {{4, 1, 2}}, - {{4, 3}}, - {{4, 1, 2}}, - {{4}}}; - - attr_vector attr_vector = { - // default - {}, - // axes = [0, 1, 2], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [0, 2], keepdims = 1 - {MakeAttribute("axes", std::vector{0, 2})}, - // axes = [0, 1], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [1], keepdims = 1 - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [2], keepdims = 0 - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [-2], keepdims = 1 - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [-2, -1], keepdims = 0 - {MakeAttribute("axes", std::vector{-2, -1}), - MakeAttribute("keepdims", int64_t(0))}}; - - RunReductionTests(op_def, input_x, input_y, attr_vector); + RunReductionTests(op_def); } TEST(GradientCheckerTest, ReduceSumGrad) { // Attribute axes supports negative values from opset 11. OpDef op_def{"ReduceSum", kOnnxDomain, 11}; - input_x_vector input_x; - for (size_t i = 0; i < 8; i++) { - input_x.push_back({{4, 3, 2}}); - } - - input_y_vector input_y = {{{1, 1, 1}}, - {{}}, - {{1, 3, 1}}, - {{2}}, - {{4, 1, 2}}, - {{4, 3}}, - {{4, 1, 2}}, - {{3}}}; - - attr_vector attr_vector = { - // default - {}, - // axes = [0, 1, 2], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [0, 2], keepdims = 1 - {MakeAttribute("axes", std::vector{0, 2})}, - // axes = [0, 1], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [1], keepdims = 1 - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [2], keepdims = 0 - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [-2], keepdims = 1 - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [-1, -3], keepdims = 0 - {MakeAttribute("axes", std::vector{-1, -3}), - MakeAttribute("keepdims", int64_t(0))}}; - - RunReductionTests(op_def, input_x, input_y, attr_vector); + RunReductionTests(op_def); } TEST(GradientCheckerTest, ReduceLogSumExpGrad) { // Attribute axes supports negative values from opset 11. 
OpDef op_def{"ReduceLogSumExp", kOnnxDomain, 11}; - input_x_vector input_x; - for (size_t i = 0; i < 8; i++) { - input_x.push_back({{4, 3, 2}}); - } - - input_y_vector input_y = {{{1, 1, 1}}, - {{}}, - {{1, 3, 1}}, - {{2}}, - {{4, 1, 2}}, - {{4, 3}}, - {{4, 1, 2}}, - {{3}}}; - - attr_vector attr_vector = { - // default - {}, - // axes = [0, 1, 2], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [0, 2], keepdims = 1 - {MakeAttribute("axes", std::vector{0, 2})}, - // axes = [0, 1], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [1], keepdims = 1 - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [2], keepdims = 0 - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [-2], keepdims = 1 - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [-1, -3], keepdims = 0 - {MakeAttribute("axes", std::vector{-1, -3}), - MakeAttribute("keepdims", int64_t(0))}}; - - RunReductionTests(op_def, input_x, input_y, attr_vector); + RunReductionTests(op_def); } #ifndef USE_CUDA
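The GetReduceLogSumExpGradient builder in patch 1 composes Sub, Exp and Mul, i.e. dX = exp(X - Y) * dY with Y = ReduceLogSumExp(X), unsqueezing Y and dY first when keepdims = 0. The snippet below is a minimal standalone numpy sketch of that identity; it is not part of the patches, the shapes, names and tolerance are made up for illustration, and it only checks the math rather than the ORT kernels.

# Hypothetical sanity check: d/dx_i logsumexp(x) = exp(x_i - logsumexp(x)), hence dX = exp(X - Y) * dY.
import numpy as np

def logsumexp(x, axis):
    # Numerically stable log(sum(exp(x))) with keepdims, matching the keepdims=1 case above.
    m = np.max(x, axis=axis, keepdims=True)
    return m + np.log(np.sum(np.exp(x - m), axis=axis, keepdims=True))

def logsumexp_grad(x, dy, axis):
    # Mirrors the builder graph: Sub(I(0), O(0)) -> Exp -> Mul with the upstream gradient.
    return np.exp(x - logsumexp(x, axis)) * dy

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 3, 2))
dy = rng.standard_normal((4, 1, 2))   # upstream gradient for reduction over axis 1, keepdims=1
analytic = logsumexp_grad(x, dy, axis=1)

# Central finite difference for one input element as a cross-check.
eps, idx = 1e-5, (2, 1, 0)
xp, xm = x.copy(), x.copy()
xp[idx] += eps
xm[idx] -= eps
numeric = np.sum((logsumexp(xp, 1) - logsumexp(xm, 1)) * dy) / (2 * eps)
assert abs(numeric - analytic[idx]) < 1e-6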