Add new operation: BroadcastTensorsOp (#33294)

PaddlePaddle · Jun 23, 2021 · affddfa · affddfa
1 parent f9420e8
commit affddfa
Show file tree

Hide file tree

Showing 7 changed files with 963 additions and 1 deletion.
diff --git a/paddle/fluid/operators/broadcast_tensors_op.cc b/paddle/fluid/operators/broadcast_tensors_op.cc
@@ -0,0 +1,253 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/broadcast_tensors_op.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/framework/var_type_inference.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+using framework::DDim;
+
+class BroadcastTensorsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
+    OP_INOUT_CHECK(ctx->HasOutputs("Out"), "Output", "Out",
+                   "broadcast_tensors");
+
+    int target_rank = 0;
+    const auto& input_dims = ctx->GetInputsDim("X");
+    // 1. Find Output rank = max(Inputs rank)
+    for (const auto& input_ddim : input_dims) {
+      target_rank = std::max(target_rank, input_ddim.size());
+    }
+
+    PADDLE_ENFORCE_GT(
+        target_rank, 0,
+        platform::errors::InvalidArgument(
+            "BroadcastTensorsOp requires at least one input tensor"
+            "to have rank greater than zero"));
+
+    std::vector<int64_t> target_dims(target_rank, 0);
+    // 2. Output dim(axis=x) = max(Inputs dim(axis=x))
+    for (int index = 0; index < target_rank; index++) {
+      // Loop axes in reverse order,
+      // For each axis, take the maximum as target size
+      // Fill size = 1 if shape vector exhausts
+      int target_dim_size = 1;
+      for (const auto& input_ddim : input_dims) {
+        // Reversed order
+        int axis = static_cast<int>(input_ddim.size()) - index - 1;
+        int dim_size = 1;
+        if (axis >= 0) {
+          dim_size = input_ddim[axis];
+        }
+
+        // We performed bcast semantics check at python level
+        // So input tensors should all have legal shape
+        target_dim_size = std::max(target_dim_size, dim_size);
+      }
+      target_dims[target_rank - index - 1] = target_dim_size;
+    }
+
+    // 3. Set Output Dim
+    std::vector<DDim> output_ddims;
+    for (size_t i = 0; i < input_dims.size(); i++) {
+      output_ddims.emplace_back(framework::make_ddim(target_dims));
+    }
+    ctx->SetOutputsDim("Out", output_ddims);
+    ctx->ShareAllLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    // Broadcast semantics enforces all input variables having the same
+    // DataType/VarType
+    // This condition is also checked during VarType Inference
+    // Here we simply copy input type to output
+    return framework::OpKernelType(
+        OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
+  }
+};
+
+class BroadcastTensorsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "A Varaible list. The shape and data type of the list elements"
+             "should be consistent. Variable can be multi-dimensional Tensor"
+             "or LoDTensor, and data types can be: bool, float16, float32, "
+             "float64, int32, "
+             "int64.")
+        .AsDuplicable();
+    AddOutput("Out",
+              "the sum of input :code:`x`. its shape and data types are "
+              "consistent with :code:`x`.")
+        .AsDuplicable();
+    AddComment(
+        R"DOC(This OP is used to broadcast a vector of inputs 
+                     with Tensor or LoDTensor type, following broadcast semantics.)DOC");
+  }
+};
+
+class BroadcastTensorsOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    // We need at least two tensors to satisfy broadcast semantics
+    size_t input_size = ctx->InputSize("X");
+    PADDLE_ENFORCE_GT(
+        input_size, 0,
+        platform::errors::InvalidArgument(
+            "BroadcastTensorsOp should have at least one input variables,"
+            "but only received %d ",
+            input_size));
+
+    // BroadcastTensorsOp takes a vector of variables named "X"
+    // Here we loop through input variables,
+    // and check if their DataType/VarType are the same
+    auto var_type = ctx->GetInputType("X", 0);
+    auto data_type = ctx->GetInputDataType("X", 0);
+    for (size_t ind = 1; ind < input_size; ind++) {
+      auto cur_var_type = ctx->GetInputType("X", ind);
+      PADDLE_ENFORCE_EQ(
+          var_type, cur_var_type,
+          platform::errors::InvalidArgument(
+              "inputs to BroadcastTensorsOp should have the same variable type,"
+              "but detected %d v.s %d ",
+              framework::ToTypeName(var_type),
+              framework::ToTypeName(cur_var_type)));
+
+      auto cur_data_type = ctx->GetInputDataType("X", ind);
+      PADDLE_ENFORCE_EQ(
+          data_type, cur_data_type,
+          platform::errors::InvalidArgument(
+              "inputs to BroadcastTensorsOp should have the same data type,"
+              "but detected %d v.s %d ",
+              framework::ToTypeName(var_type),
+              framework::ToTypeName(cur_var_type)));
+    }
+
+    // Outputs having the same DataType/VarType as inputs
+    ctx->SetOutputType("Out", var_type, framework::ALL_ELEMENTS);
+    ctx->SetOutputDataType("Out", data_type, framework::ALL_ELEMENTS);
+  }
+};
+
+/* ------ BroadcastTensorsGradOp ------ */
+class BroadcastTensorsGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    OP_INOUT_CHECK(ctx->HasOutputs(framework::GradVarName("X")), "Output",
+                   "X@grad", "broadcast_tensors");
+    OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "broadcast_tensors");
+    OP_INOUT_CHECK(ctx->HasInputs(framework::GradVarName("Out")), "Input",
+                   "Out@grad", "broadcast_tensors");
+
+    const auto& forward_input_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim(framework::GradVarName("X"), forward_input_dims);
+    ctx->ShareAllLoD("X", /*->*/ framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
+                                       ctx, framework::GradVarName("Out")),
+                                   ctx.device_context());
+  }
+};
+
+template <typename T>
+class BroadcastTensorsGradOpMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+  void Apply(GradOpPtr<T> grad_op) const override {
+    grad_op->SetType("broadcast_tensors_grad");
+    // We need "X" only for backward shape inference
+    grad_op->SetInput("X", this->Input("X"));
+    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"),
+                       this->InputGrad("X", /* drop_empty_grad */ false));
+    grad_op->SetAttrMap(this->Attrs());
+  }
+};
+
+class BroadcastTensorsGradOpVarTypeInference
+    : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext* ctx) const override {
+    auto var_type = ctx->GetInputType("X", 0);
+    auto data_type = ctx->GetInputDataType("X", 0);
+
+    ctx->SetOutputType(framework::GradVarName("X"), var_type,
+                       framework::ALL_ELEMENTS);
+    ctx->SetOutputDataType(framework::GradVarName("X"), data_type,
+                           framework::ALL_ELEMENTS);
+  }
+};
+
+DECLARE_NO_NEED_BUFFER_VARS_INFERER(BroadcastTensorsGradNoNeedBufVarsInferer,
+                                    "X");
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OPERATOR(broadcast_tensors, ops::BroadcastTensorsOp,
+                  ops::BroadcastTensorsOpMaker,
+                  ops::BroadcastTensorsGradOpMaker<paddle::framework::OpDesc>,
+                  ops::BroadcastTensorsGradOpMaker<paddle::imperative::OpBase>,
+                  ops::BroadcastTensorsOpVarTypeInference);
+
+REGISTER_OPERATOR(broadcast_tensors_grad, ops::BroadcastTensorsGradOp,
+                  ops::BroadcastTensorsGradOpVarTypeInference,
+                  ops::BroadcastTensorsGradNoNeedBufVarsInferer);
+
+REGISTER_OP_CPU_KERNEL(
+    broadcast_tensors,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext,
+                                  plat::float16>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, bool>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CPUDeviceContext, int64_t>);
+
+REGISTER_OP_CPU_KERNEL(
+    broadcast_tensors_grad,
+    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
+                                      plat::float16>,
+    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
+                                      float>,
+    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
+                                      double>,
+    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::BroadcastTensorsGradOpKernel<paddle::platform::CPUDeviceContext,
+                                      int64_t>);
diff --git a/paddle/fluid/operators/broadcast_tensors_op.cu b/paddle/fluid/operators/broadcast_tensors_op.cu
@@ -0,0 +1,132 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/broadcast_tensors_op.h"
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using framework::DDim;
+
+template <typename Tout>
+struct IdentityFunctor {
+  HOSTDEVICE explicit inline IdentityFunctor() {}
+
+  template <typename U>
+  HOSTDEVICE inline Tout operator()(const U& x) const {
+    return static_cast<Tout>(x);
+  }
+};
+
+template <typename T>
+class CUDABroadcastTensorsGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // Find reduce dimensions
+    const auto& in_tensors =
+        context.MultiInput<Tensor>(framework::GradVarName("Out"));
+    auto out_tensors = context.MultiOutput<Tensor>(framework::GradVarName("X"));
+
+    size_t num_ins = in_tensors.size();
+
+    PADDLE_ENFORCE_GT(
+        num_ins, 1,
+        platform::errors::InvalidArgument(
+            "Expected at least 2 input tensors, but only received d%.",
+            in_tensors.size()));
+
+    PADDLE_ENFORCE_EQ(
+        num_ins, out_tensors.size(),
+        platform::errors::InvalidArgument(
+            "BroadcastTensorsOp expects equal number of inputs and outputs,"
+            "but received: %d inputs v.s %d outputs",
+            num_ins, out_tensors.size()));
+
+    // For each In-Out tensor pair,
+    // Prepare and apply broadcast dims array
+    for (size_t i = 0; i < num_ins; i++) {
+      auto* input_tensor = in_tensors[i];
+      auto* output_tensor = out_tensors[i];
+
+      const DDim& input_dims = input_tensor->dims();
+      const DDim& output_dims = output_tensor->dims();
+
+      int in_rank = input_dims.size();
+      int out_rank = output_dims.size();
+
+      // Collect reduce_dims
+      // Example:
+      // dX  = [1,1,1,1]
+      // dOut = [1,1,1,4]
+      //
+      // reduce_dims  = [3] // reduce along the broadcasted axis
+      std::vector<int> reduce_dims_vec;
+      for (int j = 0; j < in_rank; j++) {
+        int out_axis = out_rank - j - 1;
+        int in_axis = in_rank - j - 1;
+
+        if (out_axis < 0 || output_dims[out_axis] != input_dims[in_axis]) {
+          reduce_dims_vec.push_back(in_axis);
+        }
+      }
+
+      bool just_copy = (reduce_dims_vec.size() == 0);
+      output_tensor->mutable_data<T>(context.GetPlace());
+      if (just_copy) {
+        // Turns out to be a No-Op, simply copy tensors
+        framework::TensorCopy(*input_tensor, context.GetPlace(),
+                              context.device_context(), output_tensor);
+      } else {
+        // reduce_sum implementation on CUDA
+        auto stream = context.cuda_device_context().stream();
+        TensorReduce<T, T, cub::Sum, IdentityFunctor<T>>(
+            *input_tensor, output_tensor, reduce_dims_vec, static_cast<T>(0),
+            cub::Sum(), IdentityFunctor<T>(), stream);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(
+    broadcast_tensors,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
+                                  plat::float16>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, bool>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::BroadcastTensorsOpKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
+
+REGISTER_OP_CUDA_KERNEL(broadcast_tensors_grad,
+                        ops::CUDABroadcastTensorsGradOpKernel<plat::float16>,
+                        ops::CUDABroadcastTensorsGradOpKernel<float>,
+                        ops::CUDABroadcastTensorsGradOpKernel<double>,
+                        ops::CUDABroadcastTensorsGradOpKernel<int>,
+                        ops::CUDABroadcastTensorsGradOpKernel<int64_t>);