
Commit 3a4ca51

Merge branch 'develop' into vs2019_part1

betterpig committed Jan 6, 2022
2 parents 7a921d6 + 89c0877
Showing 60 changed files with 1,346 additions and 963 deletions.
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()

if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211228")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220104")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
2 changes: 2 additions & 0 deletions paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2441,11 +2441,13 @@ PDNode *patterns::Bfloat16Placement::operator()(
if (!bfloat16_enabled_op_types.empty()) {
supported_op_types = bfloat16_enabled_op_types;
}
auto *op_in = pattern->NewNode(op_in_repr())->AsInput();
auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<bool>("use_mkldnn") ||
node->Op()->Type() == "reshape2";
});
op->LinksFrom({op_in});
return op;
}

1 change: 1 addition & 0 deletions paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1446,6 +1446,7 @@ struct Bfloat16Placement : public PatternBase {
PDNode* operator()(
const std::unordered_set<std::string>& bfloat16_enabled_op_types);

PATTERN_DECL_NODE(op_in);
PATTERN_DECL_NODE(op);
};

@@ -41,8 +41,12 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType(

auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_placement_pattern);
GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_placement_pattern);

// Only float input can be converted to bfloat16
if (op_in->Var()->GetDataType() != proto::VarType::FP32) return;

if ((op->Op()->HasAttr("mkldnn_data_type") ||
op->Op()->HasProtoAttr("mkldnn_data_type")) &&
!platform::HasOpINT8DataType(op->Op())) {
@@ -68,7 +68,7 @@ ProgramDesc BuildProgramDesc() {
for (auto& v :
std::vector<std::string>({"a", "b", "c", "f", "g", "h", "k", "l", "m",
"n", "o", "p", "r", "s"})) {
prog.MutableBlock(0)->Var(v);
prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32);
}

SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"});
@@ -86,9 +86,8 @@ }
}

void MainTest(std::initializer_list<std::string> bfloat16_enabled_op_types,
unsigned expected_bfloat16_data_type_count) {
auto prog = BuildProgramDesc();

unsigned expected_bfloat16_data_type_count,
const ProgramDesc& prog) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));

auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
@@ -110,8 +109,8 @@ void MainTest(std::initializer_list<std::string> bfloat16_enabled_op_types,
EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count);
}

void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) {
auto prog = BuildProgramDesc();
void DefaultAttrTest(unsigned expected_bfloat16_data_type_count,
const ProgramDesc& prog) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
graph.reset(pass->Apply(graph.release()));
@@ -128,15 +127,39 @@ void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) {
}

TEST(Bfloat16PlacementPass, enable_all) {
MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8);
MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8,
BuildProgramDesc());
}

TEST(Bfloat16PlacementPass, enabled_conv_and_pool) {
// 2 conv2d + 2 pool2 - 1 orphaned conv2d
MainTest({"conv2d", "pool2d"}, 3);
MainTest({"conv2d", "pool2d"}, 3, BuildProgramDesc());
}

TEST(Bfloat16PlacementPass, default_attr_value) {
DefaultAttrTest(10, BuildProgramDesc());
}

ProgramDesc BuildProgramDescWithDataType() {
ProgramDesc prog;

for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e"})) {
if (v == "a") {
prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::INT32);
} else {
prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32);
}
}

SetOp(&prog, "conv2d", "conv1", {"a"}, {"b"});
SetOp(&prog, "pool2d", "pool1", {"b"}, {"c"});
SetOp(&prog, "concat", "concat1", {"c", "d"}, {"e"});
return prog;
}

TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(10); }
TEST(Bfloat16PlacementPass, check_data_types) {
DefaultAttrTest(2, BuildProgramDescWithDataType());
}

} // namespace ir
} // namespace framework
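The new check_data_types test expects a count of 2 because conv1 consumes the INT32 variable "a", while pool1 and concat1 read FP32 inputs. A minimal standalone sketch of that gating rule, using hypothetical types rather than the Paddle pass itself:

#include <cassert>
#include <string>
#include <vector>

// Illustrative model of the rule added in this commit: an op is marked for
// bfloat16 only if its input variable is FP32.
enum class DType { FP32, INT32 };

struct FakeOp {
  std::string name;
  DType input_dtype;
};

int CountBfloat16Ops(const std::vector<FakeOp>& ops) {
  int count = 0;
  for (const auto& op : ops) {
    if (op.input_dtype == DType::FP32) ++count;  // mirrors the FP32 guard
  }
  return count;
}

int main() {
  // Mirrors BuildProgramDescWithDataType(): "a" is INT32, so conv1 is skipped.
  std::vector<FakeOp> ops = {{"conv1", DType::INT32},
                             {"pool1", DType::FP32},
                             {"concat1", DType::FP32}};
  assert(CountBfloat16Ops(ops) == 2);  // matches DefaultAttrTest(2, ...)
  return 0;
}
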
1 change: 1 addition & 0 deletions paddle/fluid/inference/tensorrt/op_teller.cc
@@ -726,6 +726,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h"));
auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w"));
if (!(out_h > 0 && out_w > 0)) {
if (scale.size() < 2) return false;
if (scale[0] <= 0.f || scale[1] <= 0.f) {
VLOG(3) << "scale factor must be greater than 0 if out_h or out_w is "
"not set.";
33 changes: 15 additions & 18 deletions paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -85,7 +85,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
workspaces_.emplace_back(allocation);
}

result_ = AllocShared(place_, stream_num_ * workspace_size_);
result_ = Alloc(place_, stream_num_ * workspace_size_);
}

void SingleStreamRun(size_t idx) {
@@ -185,7 +185,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
platform::CUDAPlace place_;
std::vector<gpuStream_t> streams_;
std::vector<std::shared_ptr<Allocation>> workspaces_;
std::shared_ptr<Allocation> result_;
allocation::AllocationPtr result_;
};

TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
@@ -225,22 +225,23 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {

TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;

allocation::AllocationPtr allocation_implicit_stream =
Alloc(place, alloc_size);
EXPECT_GE(allocation_implicit_stream->size(), alloc_size);
void *address = allocation_implicit_stream->ptr();
allocation_implicit_stream.reset();

auto &instance = allocation::AllocatorFacade::Instance();
const std::shared_ptr<Allocator> &allocator = instance.GetAllocator(place);

size_t alloc_size = 256;
std::shared_ptr<Allocation> allocation_from_allocator =
allocation::AllocationPtr allocation_from_allocator =
allocator->Allocate(alloc_size);
EXPECT_GE(allocation_from_allocator->size(), alloc_size);
void *address = allocation_from_allocator->ptr();
EXPECT_EQ(allocation_from_allocator->ptr(), address);
allocation_from_allocator.reset();

std::shared_ptr<Allocation> allocation_implicit_stream =
AllocShared(place, alloc_size);
EXPECT_GE(allocation_implicit_stream->size(), alloc_size);
EXPECT_EQ(allocation_implicit_stream->ptr(), address);
allocation_implicit_stream.reset();

Release(place);
CheckMemLeak(place);
}
@@ -347,16 +348,12 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
// so the second alloc will fail and retry
size_t alloc_size = available_size / 4 * 3;

std::shared_ptr<Allocation> allocation1 = AllocShared(
place, alloc_size,
platform::Stream(reinterpret_cast<platform::StreamId>(stream1)));
std::shared_ptr<Allocation> allocation2;
allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1);
allocation::AllocationPtr allocation2;

std::thread th([&allocation2, &place, &stream2, alloc_size]() {
std::this_thread::sleep_for(std::chrono::seconds(1));
allocation2 = AllocShared(
place, alloc_size,
platform::Stream(reinterpret_cast<platform::StreamId>(stream2)));
allocation2 = Alloc(place, alloc_size, stream2);
});
allocation1.reset(); // free but not release
th.join();
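The retry test sizes each request at 3/4 of the available memory, so two allocations cannot coexist and the second Alloc must wait until allocation1 is freed. A small arithmetic sketch of that sizing assumption (available_size here is an illustrative placeholder for the value the test queries at runtime):

#include <cassert>
#include <cstddef>

int main() {
  // Hypothetical amount of free GPU memory, standing in for the value the
  // test queries at runtime.
  const size_t available_size = 1 << 30;  // 1 GiB

  // Same sizing rule as the test: each request is 3/4 of what is available.
  const size_t alloc_size = available_size / 4 * 3;

  // Two such allocations together exceed the available memory, so the second
  // one can only succeed after the first is freed (hence the retry).
  assert(alloc_size * 2 > available_size);
  return 0;
}
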
4 changes: 2 additions & 2 deletions paddle/fluid/operators/dropout_impl.cu.h
@@ -34,7 +34,7 @@ limitations under the License. */
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h"
#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"

namespace paddle {
namespace operators {
@@ -193,7 +193,7 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
// VectorizedRandomGenerator use curand_uniform4, so we only support
// vec_size is 4;
int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
int block_size = pten::GetThreadsConfig(dev_ctx, x_numel, vec_size);
int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size);
int grid_size =
((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;

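The grid size above is two chained ceiling divisions: elements are grouped into vec_size-wide work items, which are then packed into block_size-thread blocks. A quick standalone check with illustrative numbers (the block size of 256 is an assumption; in the kernel it comes from GetThreadsConfig):

#include <cassert>

int main() {
  const long long x_numel = 1000;  // illustrative element count
  const int vec_size = 4;          // curand_uniform4 path
  const int block_size = 256;      // assumed; GetThreadsConfig decides this

  // ceil(x_numel / vec_size) work items, then ceil(items / block_size) blocks.
  const long long grid_size =
      ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;

  assert((x_numel + vec_size - 1) / vec_size == 250);  // ceil(1000 / 4)
  assert(grid_size == 1);  // 250 work items fit in one 256-thread block
  return 0;
}
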
42 changes: 42 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_functor.h
@@ -236,5 +236,47 @@ struct FMinFunctor<int64_t> {
}
};

template <typename T>
struct MulGradFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
template <typename T>
struct MulGradFunctor<Complex<T>> {
inline HOSTDEVICE Complex<T> operator()(const Complex<T>& a,
const Complex<T>& b) const {
Complex<T> b_conj(b.real, -b.imag);
return a * b_conj;
}
};

template <typename InT, typename OutT>
struct MulGradXYFunctor {
inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(const InT& a,
const InT& b,
const InT& c) {
paddle::framework::Array<OutT, 2> outs;
// dx = dout * y
outs[0] = a * b;
// dy = dout * x
outs[1] = a * c;
return outs;
}
};

template <typename InT, typename OutT>
struct MulGradXYFunctor<Complex<InT>, Complex<OutT>> {
inline HOSTDEVICE paddle::framework::Array<Complex<OutT>, 2> operator()(
const Complex<InT>& a, const Complex<InT>& b, const Complex<InT>& c) {
paddle::framework::Array<Complex<OutT>, 2> outs;
// dx = dout * y
Complex<InT> b_conj(b.real, -b.imag);
outs[0] = a * b_conj;
// dy = dout * x
Complex<InT> c_conj(c.real, -c.imag);
outs[1] = a * c_conj;
return outs;
}
};

} // namespace operators
} // namespace paddle
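For complex inputs these functors multiply dout by the conjugate of the other operand, i.e. dx = dout * conj(y) and dy = dout * conj(x). A standalone sketch with std::complex that mirrors that arithmetic (illustrative only; it does not use the Paddle Complex type):

#include <cassert>
#include <complex>

int main() {
  using cf = std::complex<float>;

  const cf x(1.f, 2.f);
  const cf y(3.f, -1.f);
  const cf dout(0.5f, 0.25f);

  // Mirrors MulGradXYFunctor<Complex<T>, Complex<T>>: multiply dout by the
  // conjugate of the other operand.
  const cf dx = dout * std::conj(y);
  const cf dy = dout * std::conj(x);

  // Spot-check against the same products done by hand.
  // conj(y) = (3, 1):  dx = (0.5*3 - 0.25*1, 0.5*1 + 0.25*3) = (1.25, 1.25)
  assert(std::abs(dx - cf(1.25f, 1.25f)) < 1e-6f);
  // conj(x) = (1, -2): dy = (0.5*1 + 0.25*2, -0.5*2 + 0.25*1) = (1.0, -0.75)
  assert(std::abs(dy - cf(1.0f, -0.75f)) < 1e-6f);
  return 0;
}
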
95 changes: 34 additions & 61 deletions paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */

#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"

@@ -68,69 +69,41 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
}
};

template <typename T>
static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y,
const T* out,
const T* dout,
int64_t size, T* dx,
T* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;

while (col < size) {
T o = dout[col];
dx[col] = y[col] * o;
dy[col] = x[col] * o;
col += blockDim.x * gridDim.x;
}
}

template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex<float>>(
const plat::complex<float>* x, const plat::complex<float>* y,
const plat::complex<float>* out, const plat::complex<float>* dout,
int64_t size, plat::complex<float>* dx, plat::complex<float>* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;

while (col < size) {
plat::complex<float> o = dout[col];
dx[col] = plat::complex<float>(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex<float>(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}

template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex<double>>(
const plat::complex<double>* x, const plat::complex<double>* y,
const plat::complex<double>* out, const plat::complex<double>* dout,
int64_t size, plat::complex<double>* dx, plat::complex<double>* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;

while (col < size) {
plat::complex<double> o = dout[col];
dx[col] = plat::complex<double>(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex<double>(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_mul_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
auto size = x->numel();
dim3 grid_size =
dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
SimpleElemwiseMulGradCUDAKernel<
T><<<grid_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
x->data<T>(), y->data<T>(), out->data<T>(), dout->data<T>(), size,
dx->mutable_data<T>(ctx.GetPlace()), dy->mutable_data<T>(ctx.GetPlace()));
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
ElementwiseMulGrad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out, const framework::Tensor* dout,
framework::Tensor* dx, framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis");
const auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
const auto place = ctx.GetPlace();

if (dx != nullptr && dy != nullptr) {
dx->mutable_data<T>(place);
if (dx->IsSharedBufferWith(*dout)) {
dx->clear();
dx->mutable_data<T>(x->dims(), place);
}
std::vector<const framework::Tensor*> ins = {dout, y, x};
GetGradXAndYOut<ElementwiseType::kBinary, T>(
dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor<T, T>());
} else if (dx != nullptr && dy == nullptr) {
dx->mutable_data<T>(place);
if (dx->IsSharedBufferWith(*dout)) {
dx->clear();
dx->mutable_data<T>(x->dims(), place);
}
std::vector<const framework::Tensor*> ins = {dout, y};
GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, place, axis, ins, dout,
dx, MulGradFunctor<T>());
} else if (dx == nullptr && dy != nullptr) {
std::vector<const framework::Tensor*> ins = {dout, x};
GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, place, axis, ins, dout,
dy, MulGradFunctor<T>());
}
}

} // namespace operators
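The rewritten ElementwiseMulGrad drops the hand-written grid-stride kernel in favor of functor-based kernels; when both gradients are needed, MulGradXYFunctor lets one pass over dout produce dx and dy together. A CPU-side sketch of that one-pass, two-output shape (plain C++ for illustration; the real dispatch happens inside GetGradXAndYOut on the GPU):

#include <array>
#include <cassert>
#include <vector>

// Illustrative functor: takes (dout, y, x) and returns {dx, dy} in one call,
// like MulGradXYFunctor does for the same-shape (no broadcast) case.
struct MulGradXY {
  std::array<float, 2> operator()(float dout, float y, float x) const {
    return {dout * y, dout * x};  // dx = dout * y, dy = dout * x
  }
};

int main() {
  const std::vector<float> x = {1.f, 2.f, 3.f};
  const std::vector<float> y = {4.f, 5.f, 6.f};
  const std::vector<float> dout = {1.f, 1.f, 0.5f};

  std::vector<float> dx(x.size()), dy(x.size());
  MulGradXY grad;
  for (size_t i = 0; i < x.size(); ++i) {
    const auto out = grad(dout[i], y[i], x[i]);  // one pass fills both outputs
    dx[i] = out[0];
    dy[i] = out[1];
  }

  assert(dx[2] == 3.f);   // 0.5 * 6
  assert(dy[2] == 1.5f);  // 0.5 * 3
  return 0;
}
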