From d2a526331287bec93979c2440f8ed1bfbebd97f8 Mon Sep 17 00:00:00 2001 From: Tixxx Date: Tue, 28 Jul 2020 05:19:09 +0000 Subject: [PATCH 1/4] added reducesumlogexp gradient added test fixed type mismatch when calling cudnnreduce kernel fixed python frontend to remove redundant states to match pytorch state dict --- .../providers/cuda/reduction/reduction_ops.cc | 9 ++- .../core/graph/gradient_builder.cc | 29 ++++++++ .../orttraining/core/graph/gradient_builder.h | 1 + .../core/graph/gradient_builder_registry.cc | 1 + orttraining/orttraining/python/ort_trainer.py | 8 ++- .../test/gradient/gradient_ops_test.cc | 68 +++++++++++++++++++ 6 files changed, 113 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 4723246fcff97..28245d9743823 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -396,8 +396,9 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } CudnnReduceDescriptor reduce_desc; - if (std::is_same<T, MLFloat16>::value) + if (std::is_same<T, MLFloat16>::value) { ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices)); + } else ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices)); const auto one = Consts<CudaT>::One; @@ -438,7 +439,11 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr } else { // Reduce max -- Max/Min will output indices data CudnnReduceDescriptor reduce_max_desc; - ORT_RETURN_IF_ERROR(reduce_max_desc.Set(CUDNN_REDUCE_TENSOR_MAX, cudnn_type_X, CUDNN_REDUCE_TENSOR_NO_INDICES)); + cudnnDataType_t cudnn_reduce_max_type = cudnn_type_X; + if((std::is_same<T, MLFloat16>::value)) { + cudnn_reduce_max_type = CUDNN_DATA_FLOAT; + } + ORT_RETURN_IF_ERROR(reduce_max_desc.Set(CUDNN_REDUCE_TENSOR_MAX, cudnn_reduce_max_type, CUDNN_REDUCE_TENSOR_NO_INDICES)); size_t indices_bytes_max = 0; CUDNN_RETURN_IF_ERROR(cudnnGetReductionIndicesSize(cuda_ep.PerThreadCudnnHandle(), reduce_max_desc, input_tensor, output_tensor, &indices_bytes_max)); diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index a823d46ba94fc..cd3073f1c7b1f 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -788,6 +788,35 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) { return result; } +IMPLEMENT_GRADIENT_BUILDER(GetReduceLogSumExpGradient) { + std::vector<NodeDef> result; + auto attributes = SrcNodeAttributes(); + bool keepdims = true; + if (attributes.find("keepdims") != attributes.end() && + attributes.at("keepdims").has_i()) { + keepdims = static_cast<bool>(attributes.at("keepdims").i()); + } + + ArgDef grad = GO(0); + if (!keepdims && attributes.find("axes") != attributes.end()) { + std::vector<int64_t> axes_values = RetrieveValues<int64_t>(attributes.at("axes")); + grad = IA("Unsqueezed_Grad"); + result.push_back(NodeDef("Unsqueeze", {GO(0)}, {grad}, {MakeAttribute("axes", axes_values)})); + + result.push_back(NodeDef("Unsqueeze", {O(0)}, {IA("Unsqueezed_Output")}, {MakeAttribute("axes", axes_values)})); + result.push_back(NodeDef("Sub", {I(0), IA("Unsqueezed_Output")}, {IA("Self_Sub_Result")})); + } + else { + result.push_back(NodeDef("Sub", {I(0), O(0)}, {IA("Self_Sub_Result")})); + } + + result.push_back(NodeDef("Exp", {IA("Self_Sub_Result")}, {IA("Self_Sub_Result_Exp")})); + +
result.push_back(NodeDef("Mul", {IA("Self_Sub_Result_Exp"), grad}, {GI(0)})); + + return result; +} + IMPLEMENT_GRADIENT_BUILDER(GetReduceSumGradient) { std::vector<NodeDef> result; auto attributes = SrcNodeAttributes(); diff --git a/orttraining/orttraining/core/graph/gradient_builder.h b/orttraining/orttraining/core/graph/gradient_builder.h index 9a32e421bf5b3..819c800820ed0 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.h +++ b/orttraining/orttraining/core/graph/gradient_builder.h @@ -25,6 +25,7 @@ DECLARE_GRADIENT_BUILDER(GetMulGradient) DECLARE_GRADIENT_BUILDER(GetDivGradient) DECLARE_GRADIENT_BUILDER(GetReduceMeanGradient) DECLARE_GRADIENT_BUILDER(GetReduceSumGradient) +DECLARE_GRADIENT_BUILDER(GetReduceLogSumExpGradient) DECLARE_GRADIENT_BUILDER(GetPowGradient) DECLARE_GRADIENT_BUILDER(GetConcatGradient) DECLARE_GRADIENT_BUILDER(GetReshapeGradient) diff --git a/orttraining/orttraining/core/graph/gradient_builder_registry.cc b/orttraining/orttraining/core/graph/gradient_builder_registry.cc index 94b4e1e096992..7631ce25eb311 100644 --- a/orttraining/orttraining/core/graph/gradient_builder_registry.cc +++ b/orttraining/orttraining/core/graph/gradient_builder_registry.cc @@ -51,6 +51,7 @@ void GradientBuilderRegistry::RegisterGradientBuilders() { REGISTER_GRADIENT_BUILDER("Pow", GetPowGradient); REGISTER_GRADIENT_BUILDER("ReduceMean", GetReduceMeanGradient); REGISTER_GRADIENT_BUILDER("ReduceSum", GetReduceSumGradient); + REGISTER_GRADIENT_BUILDER("ReduceLogSumExp", GetReduceLogSumExpGradient); REGISTER_GRADIENT_BUILDER("Add", GetAddSubGradient); REGISTER_GRADIENT_BUILDER("Sub", GetAddSubGradient); REGISTER_GRADIENT_BUILDER("Mul", GetMulGradient); diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index a69c41e312a9d..cbc4a2e8a2e3a 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -773,7 +773,13 @@ def state_dict(self): if n.name not in torch_state: torch_state[n.name] = torch.from_numpy(numpy_helper.to_array(n)) - return torch_state + # Need to remove redundant initializers and name suffixes to map back to original torch state names + torch_state_to_return = {} + for name, value in torch_state.items(): + if not (("Moment" in name) or ("Update_Count" in name)): + name = name.replace('_fp16', '') + torch_state_to_return[name] = value + return torch_state_to_return def load_state_dict(self, state_dict, strict=False): # Note: It may happen ONNX model has not yet been initialized diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 9bc844d46cfe0..c3a14dbb484f0 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -546,6 +546,74 @@ TEST(GradientCheckerTest, ReduceSumGrad) { } } +TEST(GradientCheckerTest, ReduceLogSumExpGrad) { + float max_error; + GradientChecker<float, float, float> gradient_checker; + // Attribute axes supports negative values from opset 11.
+ OpDef op_def{"ReduceLogSumExp", kOnnxDomain, 11}; + + // default + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 1, 1}}, &max_error); + EXPECT_IS_TINY(max_error); + } + + // axes = [0, 1, 2], keepdims = 0 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [0, 2], keepdims = 1 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 3, 1}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{0, 2})}); + EXPECT_IS_TINY(max_error); + } + + // axes = [0, 1], keepdims = 0 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{2}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{0, 1}), + MakeAttribute("keepdims", int64_t(0))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [1], keepdims = 1 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{1}), + MakeAttribute("keepdims", int64_t(1))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [2], keepdims = 0 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 3}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{2}), + MakeAttribute("keepdims", int64_t(0))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [-2], keepdims = 1 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{-2}), + MakeAttribute("keepdims", int64_t(1))}); + EXPECT_IS_TINY(max_error); + } + + // axes = [-1, -3], keepdims = 0 + { + gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{3}}, &max_error, + {MakeAttribute("axes", std::vector<int64_t>{-1, -3}), + MakeAttribute("keepdims", int64_t(0))}); + EXPECT_IS_TINY(max_error); + } +} + #ifndef USE_CUDA TEST(GradientCheckerTest, CastGrad) { // A dummy test that cast float to float From 0d49d5dd25b5e2e1bc92b337166544bfe4f1087c Mon Sep 17 00:00:00 2001 From: Tixxx Date: Wed, 29 Jul 2020 04:57:50 +0000 Subject: [PATCH 2/4] PR comments --- .../providers/cuda/reduction/reduction_ops.cc | 5 +- .../core/graph/gradient_builder.cc | 5 + orttraining/orttraining/python/ort_trainer.py | 9 +- .../test/gradient/gradient_op_test_utils.h | 4 + .../test/gradient/gradient_ops_test.cc | 337 ++++++++---------- 5 files changed, 158 insertions(+), 202 deletions(-) diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index 28245d9743823..b2cefca19651c 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -398,9 +398,10 @@ Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, Pr CudnnReduceDescriptor reduce_desc; if (std::is_same<T, MLFloat16>::value) { ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, CudnnTensor::GetDataType<float>(), ReduceTensorIndices)); - } - else + } else { ORT_RETURN_IF_ERROR(reduce_desc.Set(cudnn_reduce_op, cudnn_type_X, ReduceTensorIndices)); + } + const auto one = Consts<CudaT>::One; const auto zero = Consts<CudaT>::Zero; CudnnTensor input_tensor; diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index cd3073f1c7b1f..674063aa62695 100644 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -788,6 +788,11 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient)
{ return result; } +// Reference computation is pytorch's logsumexp_backward +// dx_i = exp(xi) / reduceSum(exp(xi)) +// O(0) = log(reduceSum(exp(xi))) +// Self_Sub_Result = I(0) - O(0) = xi - log(sum(exp(xi))) = log(exp(xi) / reduceSum(exp(xi))) +// Gradient computation is re-using output and input from forward op, can be a recomputation candidate. IMPLEMENT_GRADIENT_BUILDER(GetReduceLogSumExpGradient) { std::vector<NodeDef> result; auto attributes = SrcNodeAttributes(); diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index cbc4a2e8a2e3a..c45f04a4561ef 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -628,6 +628,8 @@ def __init__(self, model, loss_fn, model_desc, training_optimizer_name, map_opti self.world_rank = world_rank self.world_size = world_size self.use_mixed_precision = use_mixed_precision + + self.original_model_state_keys = list(model.state_dict().keys()) self.session = None self.device_ = device @@ -775,10 +777,9 @@ def state_dict(self): # Need to remove redundant initializers and name suffixes to map back to original torch state names torch_state_to_return = {} - for name, value in torch_state.items(): - if not (("Moment" in name) or ("Update_Count" in name)): - name = name.replace('_fp16', '') - torch_state_to_return[name] = value + for key in self.original_model_state_keys: + if key in torch_state: + torch_state_to_return[key] = torch_state[key] return torch_state_to_return def load_state_dict(self, state_dict, strict=False): diff --git a/orttraining/orttraining/test/gradient/gradient_op_test_utils.h b/orttraining/orttraining/test/gradient/gradient_op_test_utils.h index ad75d061627da..71ab36a3bc4fc 100644 --- a/orttraining/orttraining/test/gradient/gradient_op_test_utils.h +++ b/orttraining/orttraining/test/gradient/gradient_op_test_utils.h @@ -7,6 +7,9 @@ namespace onnxruntime { namespace test { +using input_x_vector = std::vector<std::vector<TensorInfo>>; +using input_y_vector = std::vector<std::vector<TensorInfo>>; +using attr_vector = std::vector<std::vector<AttributeProto>>; class GradientOpTester : public OpTester { public: @@ -39,3 +42,4 @@ class GradientOpTester : public OpTester { }; } // namespace test } // namespace onnxruntime + diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index c3a14dbb484f0..c1f405bb33bb0 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -37,6 +37,29 @@ static bool IsErrorWithinTolerance(float error, float tolerance) { #define EXPECT_IS_TINY(max_error) \ EXPECT_IS_TINIER_THAN(max_error, 1.5e-2f) +static void RunReductionTests(const OpDef& op_def, + const input_x_vector& input_x, + const input_y_vector& input_y, + const attr_vector& attr_vector) { + EXPECT_TRUE(input_x.size() == input_y.size()) + << "Input_x vector and input_y vector must contain same number of elements. " + << "Input_x size: " << input_x.size() + << "Input_y size:" << input_y.size(); + EXPECT_TRUE(input_x.size() == attr_vector.size()) + << "Input_x vector and attribute vector must contain same number of elements. 
" + << "Input_x size: " << input_x.size() + << "attr_vector size:" << attr_vector.size(); + + GradientChecker gradient_checker; + + float max_error; + + for (size_t i = 0; i < input_x.size(); i++) { + max_error = 0; + gradient_checker.ComputeGradientError(op_def, input_x[i], input_y[i], &max_error, attr_vector[i]); + EXPECT_IS_TINY(max_error); + } +} template void GenerateRandomDataWithOneHot( @@ -401,217 +424,138 @@ TEST(GradientCheckerTest, GemmGrad) { } TEST(GradientCheckerTest, ReduceMeanGrad) { - float max_error; - GradientChecker gradient_checker; // Attribute axes supports negative values from opset 11. OpDef op_def{"ReduceMean", kOnnxDomain, 11}; - // default - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 1, 1}}, &max_error); - EXPECT_IS_TINY(max_error); - } - - // TODO: Fix forward kernel behavior for default axes - // default axes, keepdims = 0 - /* - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, - {MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - */ - - // axes = [0, 1, 2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 3, 1}}, &max_error, - {MakeAttribute("axes", std::vector{0, 2})}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{2}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [1], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 3}}, &max_error, - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-2, -1], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4}}, &max_error, - {MakeAttribute("axes", std::vector{-2, -1}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } + input_x_vector input_x; + for (size_t i = 0; i < 8; i++) { + input_x.push_back({{4, 3, 2}}); + } + + input_y_vector input_y = {{{1, 1, 1}}, + {{}}, + {{1, 3, 1}}, + {{2}}, + {{4, 1, 2}}, + {{4, 3}}, + {{4, 1, 2}}, + {{4}}}; + + attr_vector attr_vector = { + // default + {}, + // axes = [0, 1, 2], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [0, 2], keepdims = 1 + {MakeAttribute("axes", std::vector{0, 2})}, + // axes = [0, 1], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [1], keepdims = 1 + {MakeAttribute("axes", std::vector{1}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [2], keepdims = 0 + {MakeAttribute("axes", std::vector{2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = 
[-2], keepdims = 1 + {MakeAttribute("axes", std::vector{-2}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [-2, -1], keepdims = 0 + {MakeAttribute("axes", std::vector{-2, -1}), + MakeAttribute("keepdims", int64_t(0))}}; + + RunReductionTests(op_def, input_x, input_y, attr_vector); } TEST(GradientCheckerTest, ReduceSumGrad) { - float max_error; - GradientChecker gradient_checker; // Attribute axes supports negative values from opset 11. OpDef op_def{"ReduceSum", kOnnxDomain, 11}; - // default - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 1, 1}}, &max_error); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1, 2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 3, 1}}, &max_error, - {MakeAttribute("axes", std::vector{0, 2})}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{2}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [1], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 3}}, &max_error, - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-1, -3], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{3}}, &max_error, - {MakeAttribute("axes", std::vector{-1, -3}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } + input_x_vector input_x; + for (size_t i = 0; i < 8; i++) { + input_x.push_back({{4, 3, 2}}); + } + + input_y_vector input_y = {{{1, 1, 1}}, + {{}}, + {{1, 3, 1}}, + {{2}}, + {{4, 1, 2}}, + {{4, 3}}, + {{4, 1, 2}}, + {{3}}}; + + attr_vector attr_vector = { + // default + {}, + // axes = [0, 1, 2], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [0, 2], keepdims = 1 + {MakeAttribute("axes", std::vector{0, 2})}, + // axes = [0, 1], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [1], keepdims = 1 + {MakeAttribute("axes", std::vector{1}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [2], keepdims = 0 + {MakeAttribute("axes", std::vector{2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [-2], keepdims = 1 + {MakeAttribute("axes", std::vector{-2}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [-1, -3], keepdims = 0 + {MakeAttribute("axes", std::vector{-1, -3}), + MakeAttribute("keepdims", int64_t(0))}}; + + RunReductionTests(op_def, input_x, input_y, attr_vector); } TEST(GradientCheckerTest, ReduceLogSumExpGrad) { - float max_error; - GradientChecker gradient_checker; // Attribute axes supports 
negative values from opset 11. OpDef op_def{"ReduceLogSumExp", kOnnxDomain, 11}; - // default - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 1, 1}}, &max_error); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1, 2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{1, 3, 1}}, &max_error, - {MakeAttribute("axes", std::vector{0, 2})}); - EXPECT_IS_TINY(max_error); - } - - // axes = [0, 1], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{2}}, &max_error, - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [1], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [2], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 3}}, &max_error, - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-2], keepdims = 1 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{4, 1, 2}}, &max_error, - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}); - EXPECT_IS_TINY(max_error); - } - - // axes = [-1, -3], keepdims = 0 - { - gradient_checker.ComputeGradientError(op_def, {{4, 3, 2}}, {{3}}, &max_error, - {MakeAttribute("axes", std::vector{-1, -3}), - MakeAttribute("keepdims", int64_t(0))}); - EXPECT_IS_TINY(max_error); - } + input_x_vector input_x; + for (size_t i = 0; i < 8; i++) { + input_x.push_back({{4, 3, 2}}); + } + + input_y_vector input_y = {{{1, 1, 1}}, + {{}}, + {{1, 3, 1}}, + {{2}}, + {{4, 1, 2}}, + {{4, 3}}, + {{4, 1, 2}}, + {{3}}}; + + attr_vector attr_vector = { + // default + {}, + // axes = [0, 1, 2], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [0, 2], keepdims = 1 + {MakeAttribute("axes", std::vector{0, 2})}, + // axes = [0, 1], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [1], keepdims = 1 + {MakeAttribute("axes", std::vector{1}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [2], keepdims = 0 + {MakeAttribute("axes", std::vector{2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [-2], keepdims = 1 + {MakeAttribute("axes", std::vector{-2}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [-1, -3], keepdims = 0 + {MakeAttribute("axes", std::vector{-1, -3}), + MakeAttribute("keepdims", int64_t(0))}}; + + RunReductionTests(op_def, input_x, input_y, attr_vector); } #ifndef USE_CUDA @@ -1998,3 +1942,4 @@ TEST(GradientCheckerTest, ExpandGrad) { } // namespace onnxruntime #endif // NDEBUG + From 3e9a72db386f3659fd7122c7408b0b47d593f76a Mon Sep 17 00:00:00 2001 From: Tixxx Date: Wed, 29 Jul 2020 16:44:09 +0000 Subject: [PATCH 3/4] fixed python frontend test failure --- orttraining/orttraining/python/ort_trainer.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index 
c45f04a4561ef..4c1dba47c6261 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -628,8 +628,8 @@ def __init__(self, model, loss_fn, model_desc, training_optimizer_name, map_opti self.world_rank = world_rank self.world_size = world_size self.use_mixed_precision = use_mixed_precision - - self.original_model_state_keys = list(model.state_dict().keys()) + + self.original_model_state_keys = list(model.state_dict().keys()) if hasattr(model, 'state_dict') else [] self.session = None self.device_ = device @@ -777,9 +777,12 @@ def state_dict(self): # Need to remove redundant initializers and name suffixes to map back to original torch state names torch_state_to_return = {} - for key in self.original_model_state_keys: - if key in torch_state: - torch_state_to_return[key] = torch_state[key] + if self.original_model_state_keys: + for key in self.original_model_state_keys: + if key in torch_state: + torch_state_to_return[key] = torch_state[key] + else: + torch_state_to_return = torch_state return torch_state_to_return def load_state_dict(self, state_dict, strict=False): From 60c517c7fcd62afe3fdc7dbc4b10b023baebbba4 Mon Sep 17 00:00:00 2001 From: Tixxx Date: Wed, 29 Jul 2020 21:13:26 +0000 Subject: [PATCH 4/4] PR comments --- orttraining/orttraining/python/ort_trainer.py | 10 +- .../test/gradient/gradient_op_test_utils.h | 6 +- .../test/gradient/gradient_ops_test.cc | 189 ++++++------------ 3 files changed, 64 insertions(+), 141 deletions(-) diff --git a/orttraining/orttraining/python/ort_trainer.py b/orttraining/orttraining/python/ort_trainer.py index 4c1dba47c6261..b350e3c578c77 100644 --- a/orttraining/orttraining/python/ort_trainer.py +++ b/orttraining/orttraining/python/ort_trainer.py @@ -776,13 +776,9 @@ def state_dict(self): torch_state[n.name] = torch.from_numpy(numpy_helper.to_array(n)) # Need to remove redundant initializers and name suffixes to map back to original torch state names - torch_state_to_return = {} - if self.original_model_state_keys: - for key in self.original_model_state_keys: - if key in torch_state: - torch_state_to_return[key] = torch_state[key] - else: - torch_state_to_return = torch_state + torch_state_to_return = {key: torch_state[key] for key in self.original_model_state_keys if key in torch_state} \ + if self.original_model_state_keys \ + else torch_state return torch_state_to_return def load_state_dict(self, state_dict, strict=False): diff --git a/orttraining/orttraining/test/gradient/gradient_op_test_utils.h b/orttraining/orttraining/test/gradient/gradient_op_test_utils.h index 71ab36a3bc4fc..26f56ddbe0fbf 100644 --- a/orttraining/orttraining/test/gradient/gradient_op_test_utils.h +++ b/orttraining/orttraining/test/gradient/gradient_op_test_utils.h @@ -7,9 +7,9 @@ namespace onnxruntime { namespace test { -using input_x_vector = std::vector<std::vector<TensorInfo>>; -using input_y_vector = std::vector<std::vector<TensorInfo>>; -using attr_vector = std::vector<std::vector<AttributeProto>>; +using TestDataVector = std::tuple<std::vector<std::vector<TensorInfo>>, // Input data + std::vector<std::vector<TensorInfo>>, // output data + std::vector<std::vector<AttributeProto>>>; //attribute class GradientOpTester : public OpTester { public: diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index c1f405bb33bb0..fe9b1ee0e92bd 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -37,6 +37,26 @@ static bool IsErrorWithinTolerance(float error, float tolerance) { #define EXPECT_IS_TINY(max_error) \
EXPECT_IS_TINIER_THAN(max_error, 1.5e-2f) -static void RunReductionTests(const OpDef& op_def, - const input_x_vector& input_x, - const input_y_vector& input_y, - const attr_vector& attr_vector) { - EXPECT_TRUE(input_x.size() == input_y.size()) - << "Input_x vector and input_y vector must contain same number of elements. " - << "Input_x size: " << input_x.size() - << "Input_y size:" << input_y.size(); - EXPECT_TRUE(input_x.size() == attr_vector.size()) - << "Input_x vector and attribute vector must contain same number of elements. " - << "Input_x size: " << input_x.size() - << "attr_vector size:" << attr_vector.size(); + +static void RunReductionTests(const OpDef& op_def) { + + TestDataVector test_data( + // Input X + { + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + {{4, 3, 2}}, + }, + // Input Y + { + {{1, 1, 1}}, + {{}}, + {{1, 3, 1}}, + {{2}}, + {{4, 1, 2}}, + {{4, 3}}, + {{4, 1, 2}}, + {{4}} + }, + // Attributes + { + // default + {}, + // axes = [0, 1, 2], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1, 2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [0, 2], keepdims = 1 + {MakeAttribute("axes", std::vector{0, 2})}, + // axes = [0, 1], keepdims = 0 + {MakeAttribute("axes", std::vector{0, 1}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [1], keepdims = 1 + {MakeAttribute("axes", std::vector{1}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [2], keepdims = 0 + {MakeAttribute("axes", std::vector{2}), + MakeAttribute("keepdims", int64_t(0))}, + // axes = [-2], keepdims = 1 + {MakeAttribute("axes", std::vector{-2}), + MakeAttribute("keepdims", int64_t(1))}, + // axes = [-2, -1], keepdims = 0 + {MakeAttribute("axes", std::vector{-2, -1}), + MakeAttribute("keepdims", int64_t(0))} + }); GradientChecker gradient_checker; float max_error; - for (size_t i = 0; i < input_x.size(); i++) { + for (size_t i = 0; i < std::get<0>(test_data).size(); i++) { max_error = 0; - gradient_checker.ComputeGradientError(op_def, input_x[i], input_y[i], &max_error, attr_vector[i]); + gradient_checker.ComputeGradientError(op_def, std::get<0>(test_data)[i], + std::get<1>(test_data)[i], &max_error, + std::get<2>(test_data)[i]); EXPECT_IS_TINY(max_error); } } @@ -427,135 +468,21 @@ TEST(GradientCheckerTest, ReduceMeanGrad) { // Attribute axes supports negative values from opset 11. 
OpDef op_def{"ReduceMean", kOnnxDomain, 11}; - input_x_vector input_x; - for (size_t i = 0; i < 8; i++) { - input_x.push_back({{4, 3, 2}}); - } - - input_y_vector input_y = {{{1, 1, 1}}, - {{}}, - {{1, 3, 1}}, - {{2}}, - {{4, 1, 2}}, - {{4, 3}}, - {{4, 1, 2}}, - {{4}}}; - - attr_vector attr_vector = { - // default - {}, - // axes = [0, 1, 2], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [0, 2], keepdims = 1 - {MakeAttribute("axes", std::vector{0, 2})}, - // axes = [0, 1], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [1], keepdims = 1 - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [2], keepdims = 0 - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [-2], keepdims = 1 - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [-2, -1], keepdims = 0 - {MakeAttribute("axes", std::vector{-2, -1}), - MakeAttribute("keepdims", int64_t(0))}}; - - RunReductionTests(op_def, input_x, input_y, attr_vector); + RunReductionTests(op_def); } TEST(GradientCheckerTest, ReduceSumGrad) { // Attribute axes supports negative values from opset 11. OpDef op_def{"ReduceSum", kOnnxDomain, 11}; - input_x_vector input_x; - for (size_t i = 0; i < 8; i++) { - input_x.push_back({{4, 3, 2}}); - } - - input_y_vector input_y = {{{1, 1, 1}}, - {{}}, - {{1, 3, 1}}, - {{2}}, - {{4, 1, 2}}, - {{4, 3}}, - {{4, 1, 2}}, - {{3}}}; - - attr_vector attr_vector = { - // default - {}, - // axes = [0, 1, 2], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [0, 2], keepdims = 1 - {MakeAttribute("axes", std::vector{0, 2})}, - // axes = [0, 1], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [1], keepdims = 1 - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [2], keepdims = 0 - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [-2], keepdims = 1 - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [-1, -3], keepdims = 0 - {MakeAttribute("axes", std::vector{-1, -3}), - MakeAttribute("keepdims", int64_t(0))}}; - - RunReductionTests(op_def, input_x, input_y, attr_vector); + RunReductionTests(op_def); } TEST(GradientCheckerTest, ReduceLogSumExpGrad) { // Attribute axes supports negative values from opset 11. 
OpDef op_def{"ReduceLogSumExp", kOnnxDomain, 11}; - input_x_vector input_x; - for (size_t i = 0; i < 8; i++) { - input_x.push_back({{4, 3, 2}}); - } - - input_y_vector input_y = {{{1, 1, 1}}, - {{}}, - {{1, 3, 1}}, - {{2}}, - {{4, 1, 2}}, - {{4, 3}}, - {{4, 1, 2}}, - {{3}}}; - - attr_vector attr_vector = { - // default - {}, - // axes = [0, 1, 2], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1, 2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [0, 2], keepdims = 1 - {MakeAttribute("axes", std::vector{0, 2})}, - // axes = [0, 1], keepdims = 0 - {MakeAttribute("axes", std::vector{0, 1}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [1], keepdims = 1 - {MakeAttribute("axes", std::vector{1}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [2], keepdims = 0 - {MakeAttribute("axes", std::vector{2}), - MakeAttribute("keepdims", int64_t(0))}, - // axes = [-2], keepdims = 1 - {MakeAttribute("axes", std::vector{-2}), - MakeAttribute("keepdims", int64_t(1))}, - // axes = [-1, -3], keepdims = 0 - {MakeAttribute("axes", std::vector{-1, -3}), - MakeAttribute("keepdims", int64_t(0))}}; - - RunReductionTests(op_def, input_x, input_y, attr_vector); + RunReductionTests(op_def); } #ifndef USE_CUDA
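The GetReduceLogSumExpGradient builder in patch 1 composes Sub, Exp and Mul, i.e. dX = exp(X - Y) * dY with Y = ReduceLogSumExp(X), unsqueezing Y and dY first when keepdims = 0. The snippet below is a minimal standalone numpy sketch of that identity; it is not part of the patches, the shapes, names and tolerance are made up for illustration, and it only checks the math rather than the ORT kernels.

# Hypothetical sanity check: d/dx_i logsumexp(x) = exp(x_i - logsumexp(x)), hence dX = exp(X - Y) * dY.
import numpy as np

def logsumexp(x, axis):
    # Numerically stable log(sum(exp(x))) with keepdims, matching the keepdims=1 case above.
    m = np.max(x, axis=axis, keepdims=True)
    return m + np.log(np.sum(np.exp(x - m), axis=axis, keepdims=True))

def logsumexp_grad(x, dy, axis):
    # Mirrors the builder graph: Sub(I(0), O(0)) -> Exp -> Mul with the upstream gradient.
    return np.exp(x - logsumexp(x, axis)) * dy

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 3, 2))
dy = rng.standard_normal((4, 1, 2))   # upstream gradient for reduction over axis 1, keepdims=1
analytic = logsumexp_grad(x, dy, axis=1)

# Central finite difference for one input element as a cross-check.
eps, idx = 1e-5, (2, 1, 0)
xp, xm = x.copy(), x.copy()
xp[idx] += eps
xm[idx] -= eps
numeric = np.sum((logsumexp(xp, 1) - logsumexp(xm, 1)) * dy) / (2 * eps)
assert abs(numeric - analytic[idx]) < 1e-6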