Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into full_kernel

zyfncg committed Nov 2, 2021
2 parents 4d77b09 + 4a7f1a0 commit dfb7f37
Showing 35 changed files with 1,430 additions and 115 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -32,3 +32,6 @@ build_*
cmake-build-*
paddle/fluid/operators/distributed/send_recv.proto
model_test

Testing
tools/__pycache__
7 changes: 4 additions & 3 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -241,13 +241,14 @@ void InterpreterCore::BuildInplace() {
auto& outputs = instr.Outputs();
for (auto& pair : in_to_outs) {
auto iter = inputs.find(pair.first);
if (iter != inputs.end()) {
if (iter != inputs.end() && !iter->second.empty()) {
if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
auto iterout = outputs.find(pair.second);
if (iterout != outputs.end()) {
if (iterout != outputs.end() && !iterout->second.empty()) {
auto invar = global_scope_->Var(iter->second[0]);
auto outvar = global_scope_->Var(iterout->second[0]);
if (invar && outvar) {
if (invar && outvar && invar->IsType<LoDTensor>() &&
outvar->IsType<LoDTensor>()) {
instr.AddInplace(invar, outvar);
VLOG(3) << "inplace " << vec_instruction_[i].OpBase()->Type()
<< " " << global_scope_->GetNameById(iter->second[0])
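The three added conditions above all guard against dereferencing something that may not exist: a declared-but-unbound input or output slot (the empty() checks) and a variable that is not a LoDTensor. A self-contained sketch of the slot checks, with invented map contents for illustration:

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<int>> inputs{{"X", {7}}};
  std::map<std::string, std::vector<int>> outputs{{"Out", {}}};  // unbound slot

  auto in = inputs.find("X");
  auto out = outputs.find("Out");
  // Without the !empty() checks, out->second[0] below would index out of range.
  if (in != inputs.end() && !in->second.empty() &&
      out != outputs.end() && !out->second.empty()) {
    std::cout << "inplace " << in->second[0] << " -> " << out->second[0] << "\n";
  } else {
    std::cout << "skip inplace: unbound slot\n";
  }
}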
23 changes: 18 additions & 5 deletions paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -142,8 +142,8 @@ void build_variable_scope(const framework::ProgramDesc& pdesc,
if (nullptr == var_scope->FindVar(var_name)) {
var_scope->AddVar(var_desc->Name(), var_desc);
} else {
auto* var_desc = var_scope->VarDesc(var_name);
if (nullptr == var_desc) {
auto* var_desc_tmp = var_scope->VarDesc(var_name);
if (nullptr == var_desc_tmp) {
VLOG(3) << "update var:" << var_name << " desc from nullptr into "
<< var_desc;
var_scope->VarMetaInfo(var_name).vardesc_ = var_desc;
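The rename above (var_desc to var_desc_tmp) fixes a shadowing bug: the inner declaration hid the outer var_desc, so the VLOG and the vardesc_ assignment saw the (null) inner pointer instead of the descriptor being installed. A distilled sketch of the hazard:

#include <iostream>

int main() {
  const char* var_desc = "real-descriptor";  // outer, from the enclosing loop
  const char* var_desc_tmp = nullptr;        // was also named var_desc before
  if (var_desc_tmp == nullptr) {
    // With the old shadowing name, this branch printed and assigned the null
    // inner pointer; after the rename it correctly uses the outer var_desc.
    std::cout << "update desc from nullptr into " << var_desc << "\n";
  }
}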
@@ -206,9 +206,22 @@ void apply_device_guard(const OperatorBase* op_base,
VLOG(3) << "Switch into CPUPlace by device_guard.";
expected_kernel_key->place_ = platform::CPUPlace();
} else if (op_device.find("gpu") != std::string::npos &&
platform::is_gpu_place(place)) {
VLOG(3) << "Switch into " << place << " by device_guard.";
expected_kernel_key->place_ = place;
(platform::is_gpu_place(place) ||
platform::is_npu_place(place))) {
// When an Op that only has a CPUKernel is assigned to GPU or NPU, the
// CPUKernel will be executed and a warning will be given at the same time.
if (op_base->SupportGPU()) {
expected_kernel_key->place_ = place;
} else if (op_base->SupportNPU()) {
expected_kernel_key->place_ = place;
} else {
expected_kernel_key->place_ = platform::CPUPlace();
LOG_FIRST_N(WARNING, 1)
<< "Op(" << op_base->Type()
<< ") has no CUDA implementation. It will be assigned to CPUPlace.";
}
VLOG(3) << "Switch into " << expected_kernel_key->place_
<< " by device_guard.";
} else {
PADDLE_THROW(
platform::errors::Fatal("Unsupported current place %s", op_device));
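The rewritten branch above keeps the requested place when the op has a kernel for it and otherwise falls back to CPU with a one-time warning. A distilled sketch of that decision; the enum and function names are stand-ins, not Paddle's types:

#include <iostream>

enum class Place { CPU, GPU, NPU };

// Stand-in for the expected_kernel_key->place_ selection above.
Place ApplyDeviceGuard(Place requested, bool supports_gpu, bool supports_npu) {
  if (supports_gpu) {
    return requested;  // op has a GPU kernel: honor device_guard
  } else if (supports_npu) {
    return requested;  // op has an NPU kernel: honor device_guard
  }
  // Mirrors LOG_FIRST_N(WARNING, 1): the CPU kernel runs, the user is told once.
  std::cerr << "Op has no device implementation; assigned to CPUPlace.\n";
  return Place::CPU;
}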
9 changes: 9 additions & 0 deletions paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -474,6 +474,15 @@ struct VariableMetaInfo {
// TODO(zhiqiu): Maybe we need to add rwlock for VariableScope?
class VariableScope : public ScopeBase {
public:
VariableScope() {
// for @EMPTY@ variable
var_list_.push_back(nullptr);
name2id_[kEmptyVarName] = 0;
VariableMetaInfo info;
info.var_ref_count_ = 0;
info.vardesc_ = nullptr;
vec_meta_info_.push_back(info);
}
Variable* FindVar(const std::string& name) const {
auto it = name2id_.find(name);
if (it != name2id_.end()) {
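The new constructor reserves slot 0 of both tables for kEmptyVarName, so id 0 is never handed to a real variable. A minimal stand-in sketch of that reservation, with simplified types:

#include <map>
#include <string>
#include <vector>

struct Variable {};

class ScopeSketch {
 public:
  ScopeSketch() {
    // Mirror the constructor above: index 0 is the @EMPTY@ placeholder.
    var_list_.push_back(nullptr);
    name2id_["@EMPTY@"] = 0;
    ref_counts_.push_back(0);
  }
  Variable* FindVar(const std::string& name) const {
    auto it = name2id_.find(name);
    return it == name2id_.end() ? nullptr : var_list_[it->second];
  }

 private:
  std::vector<Variable*> var_list_;
  std::map<std::string, int> name2id_;
  std::vector<int> ref_counts_;
};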
2 changes: 1 addition & 1 deletion paddle/fluid/operators/CMakeLists.txt
@@ -99,7 +99,7 @@ if (WITH_GPU OR WITH_ROCM)
endif()
op_library(sync_batch_norm_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) )
if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) )
op_library(sparse_attention_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n")
endif()
28 changes: 26 additions & 2 deletions paddle/fluid/operators/controlflow/fetch_v2_op.cc
@@ -77,12 +77,35 @@ class FetchV2Op : public framework::OperatorWithKernel {
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override {
if (!tensor.IsInitialized()) {
return expected_kernel_type;
}
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}

framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto *fetch_var = ctx.InputVar("X");
if (fetch_var == nullptr) {
return framework::OpKernelType(framework::proto::VarType::FP32,
platform::CPUPlace());
}

if (fetch_var->IsType<framework::LoDTensor>()) {
auto &src_item = fetch_var->Get<framework::LoDTensor>();
if (!src_item.IsInitialized()) {
return framework::OpKernelType(framework::proto::VarType::FP32,
platform::CPUPlace());
}
} else {
auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
if (src_item.empty() || !src_item[0].IsInitialized()) {
return framework::OpKernelType(framework::proto::VarType::FP32,
platform::CPUPlace());
}
}

return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
platform::CPUPlace());
@@ -127,6 +150,9 @@ class FetchV2Kernel {

if (fetch_var->IsType<framework::LoDTensor>()) {
auto &src_item = fetch_var->Get<framework::LoDTensor>();
if (!src_item.IsInitialized()) {
return;
}
auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col)));
bool check_place = platform::is_cpu_place(src_item.place()) ||
platform::is_cuda_pinned_place(src_item.place());
@@ -173,9 +199,7 @@ class FetchV2OpProtoMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(true);
AddComment(R"DOC(
FetchV2 Operator.
It should not be configured by users directly.
)DOC");
}
};
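Both FetchV2 changes above implement the same defensive rule: an uninitialized LoDTensor (or an empty LoDTensorArray) cannot report a data type, so GetExpectedKernelType falls back to a default kernel key (FP32 on CPU) and the kernel returns early. A distilled sketch with stand-in types, names invented for illustration:

#include <optional>

enum class DType { FP32, FP64 };
struct KernelKey { DType dtype; };

struct TensorStub {
  std::optional<DType> dtype;  // empty means uninitialized
  bool IsInitialized() const { return dtype.has_value(); }
};

KernelKey ExpectedKernelKey(const TensorStub& t) {
  if (!t.IsInitialized()) {
    return KernelKey{DType::FP32};  // safe default, as in the op above
  }
  return KernelKey{*t.dtype};
}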
25 changes: 16 additions & 9 deletions paddle/fluid/operators/matmul_v2_op.h
@@ -25,6 +25,11 @@ limitations under the License. */
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"

// can only include headers from the paddle/pten/api dirs
#include "paddle/pten/api/include/core.h"
#include "paddle/pten/api/include/linalg.h"
#include "paddle/pten/hapi/lib/utils/tensor_utils.h"

#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#endif
@@ -380,15 +385,17 @@ class MatMulV2Kernel : public framework::OpKernel<T> {
auto* Out = ctx.Output<Tensor>("Out");
bool trans_x = ctx.Attr<bool>("trans_x");
bool trans_y = ctx.Attr<bool>("trans_y");
PADDLE_ENFORCE_NE(framework::product(X->dims()), 0,
platform::errors::InvalidArgument(
"The Input(X) dims size must not be equal to 0,"
" but received dims size is 0. "));
PADDLE_ENFORCE_NE(framework::product(Y->dims()), 0,
platform::errors::InvalidArgument(
"The Input(Y) dims size must not be equal to 0,"
" but received dims size is 0. "));
MatMulFunction<DeviceContext, T>(X, Y, Out, trans_x, trans_y, ctx);

auto& dev_ctx = ctx.device_context<DeviceContext>();
Out->mutable_data<T>(X->place());

auto pt_x = paddle::experimental::MakePtenDenseTensor(*X);
auto pt_y = paddle::experimental::MakePtenDenseTensor(*Y);
auto pt_out = paddle::experimental::MakePtenDenseTensor(*Out);

// call new kernel
pten::Matmul<T>(dev_ctx, *pt_x.get(), *pt_y.get(), trans_x, trans_y,
pt_out.get());
}
};

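The new kernel body above follows an adapter pattern: allocate the fluid output, wrap the fluid tensors as pten DenseTensors via MakePtenDenseTensor, and forward to pten::Matmul. A minimal stand-in sketch of that flow; the types and names below are invented for illustration, not Paddle's real API:

#include <memory>
#include <vector>

struct FluidTensor { std::vector<float> data; };
struct PtenView { std::vector<float>* data; };  // non-owning wrapper

std::unique_ptr<PtenView> MakeView(FluidTensor& t) {
  return std::unique_ptr<PtenView>(new PtenView{&t.data});
}

void PtenMatmul(const PtenView& x, const PtenView& y, bool trans_x,
                bool trans_y, PtenView* out) {
  // Stand-in for the real pten::Matmul kernel.
}

void MatmulCompute(FluidTensor& x, FluidTensor& y, FluidTensor* out) {
  out->data.resize(x.data.size());  // mirrors Out->mutable_data<T>(X->place())
  auto px = MakeView(x);            // mirrors MakePtenDenseTensor(*X)
  auto py = MakeView(y);
  auto pout = MakeView(*out);
  PtenMatmul(*px, *py, /*trans_x=*/false, /*trans_y=*/false, pout.get());
}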
4 changes: 2 additions & 2 deletions paddle/fluid/platform/dynload/cusparse.h
@@ -56,8 +56,8 @@ extern void *cusparse_dso_handle;

CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);

// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
__macro(cusparseSDDMM_preprocess); \
19 changes: 19 additions & 0 deletions paddle/fluid/pybind/pybind.cc
@@ -226,6 +226,23 @@ bool SupportsBfloat16FastPerformance() {
#endif
}

bool SupportsInt8() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
return (platform::MayIUse(platform::cpu_isa_t::avx2) ||
platform::MayIUse(platform::cpu_isa_t::avx512f));
#endif
}

bool SupportsVNNI() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni);
#endif
}

// According to the input `place` and `dtype`, this function returns a tuple
// consisting of three sets:
// 1) All operators registered in the Paddle framework.
@@ -2121,6 +2138,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS);
m.def("supports_bfloat16", SupportsBfloat16);
m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
m.def("supports_int8", SupportsInt8);
m.def("supports_vnni", SupportsVNNI);
m.def("op_supported_infos", OpSupportedInfos);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST);
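The two probes added above combine a compile-time build flag with a runtime ISA query. A distilled sketch of the same shape, with a stubbed-out probe standing in for platform::MayIUse (the stub is an assumption for illustration):

#include <string>

// Stub for platform::MayIUse; the real one queries the CPU at runtime.
bool MayIUse(const std::string& isa) { return isa == "avx2"; }

bool SupportsInt8() {
#ifndef PADDLE_WITH_MKLDNN
  return false;
#else
  return MayIUse("avx2") || MayIUse("avx512f");
#endif
}

bool SupportsVNNI() {
#ifndef PADDLE_WITH_MKLDNN
  return false;
#else
  return MayIUse("avx512_core_vnni");
#endif
}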
21 changes: 21 additions & 0 deletions paddle/pten/api/include/creation.h
@@ -14,5 +14,26 @@

#pragma once

#include "paddle/pten/api/include/infershape.h"
#include "paddle/pten/hapi/lib/utils/allocator.h"
#include "paddle/pten/kernels/cpu/creation.h"
#include "paddle/pten/kernels/cuda/creation.h"

namespace pten {

// TODO(YuanRisheng) This function name should be the same as the user API name.
// TODO(zyfncg) Automatic code generation
template <typename T, typename ContextT>
DenseTensor FillAnyLike(const ContextT& dev_ctx,
const DenseTensor& x,
const Scalar& val) {
auto out_meta = UnchangedInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
FillAnyLike<T>(dev_ctx, x, val, &dense_out);
return dense_out;
}

} // namespace pten
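FillAnyLike above, and the Dot/Flatten/Sign/Mean/Scale wrappers in the headers that follow, all share one shape: infer the output meta from the inputs, allocate a DenseTensor through the device allocator, run the in-place kernel into it, and return by value. A minimal sketch of that pattern with stand-in types, not Paddle's real ones:

#include <utility>
#include <vector>

struct Meta { std::vector<int> dims; };

struct DenseTensor {
  explicit DenseTensor(Meta m) : meta(std::move(m)) {}
  Meta meta;
  std::vector<float> data;
};

// Same-shape ops reuse the input meta, like UnchangedInferShape above.
Meta UnchangedInferShape(const Meta& in) { return in; }

// Stand-in for the low-level kernel that writes into a preallocated output.
void SignKernel(const DenseTensor& x, DenseTensor* out) {}

DenseTensor Sign(const DenseTensor& x) {
  DenseTensor out(UnchangedInferShape(x.meta));  // 1) infer output meta
  SignKernel(x, &out);                           // 2) run kernel in place
  return out;                                    // 3) return by value
}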
19 changes: 19 additions & 0 deletions paddle/pten/api/include/linalg.h
@@ -15,5 +15,24 @@
#pragma once

// See Note: [ How do we organize the kernel directory ]
#include "paddle/pten/api/include/infershape.h"
#include "paddle/pten/hapi/lib/utils/allocator.h"
#include "paddle/pten/kernels/cpu/linalg.h"
#include "paddle/pten/kernels/cuda/linalg.h"

namespace pten {

template <typename T, typename ContextT>
DenseTensor Dot(const ContextT& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
auto out_meta = DotInferShape(x.meta(), y.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Dot<T>(dev_ctx, x, y, &dense_out);
return dense_out;
}

} // namespace pten
20 changes: 20 additions & 0 deletions paddle/pten/api/include/manipulation.h
@@ -15,5 +15,25 @@
#pragma once

// See Note: [ How do we organize the kernel directory ]
#include "paddle/pten/api/include/infershape.h"
#include "paddle/pten/hapi/lib/utils/allocator.h"
#include "paddle/pten/kernels/cpu/manipulation.h"
#include "paddle/pten/kernels/cuda/manipulation.h"

namespace pten {

template <typename T, typename ContextT>
DenseTensor Flatten(const ContextT& dev_ctx,
const DenseTensor& x,
int start_axis,
int stop_axis) {
auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Flatten<T>(dev_ctx, x, start_axis, stop_axis, &dense_out);
return dense_out;
}

} // namespace pten
57 changes: 57 additions & 0 deletions paddle/pten/api/include/math.h
@@ -15,5 +15,62 @@ limitations under the License. */
#pragma once

// See Note: [ How do we organize the kernel directory ]
#include "paddle/pten/api/include/infershape.h"
#include "paddle/pten/hapi/lib/utils/allocator.h"
#include "paddle/pten/kernels/cpu/math.h"
#include "paddle/pten/kernels/cuda/math.h"

namespace pten {

template <typename T, typename ContextT>
DenseTensor Sign(const ContextT& dev_ctx, const DenseTensor& x) {
auto out_meta = UnchangedInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Sign<T>(dev_ctx, x, &dense_out);
return dense_out;
}

template <typename T, typename ContextT>
DenseTensor Mean(const ContextT& dev_ctx, const DenseTensor& x) {
auto out_meta = ReductionInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Mean<T>(dev_ctx, x, &dense_out);
return dense_out;
}

template <typename T, typename ContextT>
DenseTensor Scale(const ContextT& dev_ctx,
const DenseTensor& x,
float scale,
float bias,
bool bias_after_scale) {
auto out_meta = UnchangedInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Scale<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
return dense_out;
}

template <typename T, typename ContextT>
DenseTensor Scale(const ContextT& dev_ctx,
const DenseTensor& x,
const DenseTensor& scale,
float bias,
bool bias_after_scale) {
auto out_meta = UnchangedInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
ScaleHost<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
return dense_out;
}
} // namespace pten
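Taken together with the creation, linalg, and manipulation headers, this gives a small functional surface. A hypothetical call site, where the device-context variable and its construction are assumptions not shown in this commit:

// Hypothetical usage; dev_ctx is an assumed CPU device context.
// auto s = pten::Sign<float>(dev_ctx, x);
// auto m = pten::Mean<float>(dev_ctx, x);
// auto y = pten::Scale<float>(dev_ctx, x, /*scale=*/2.f, /*bias=*/0.f,
//                             /*bias_after_scale=*/true);
// The second Scale overload takes the scale as a DenseTensor and dispatches
// to ScaleHost<T> instead.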
5 changes: 5 additions & 0 deletions paddle/pten/hapi/include/linalg.h
@@ -21,5 +21,10 @@ namespace experimental {

Tensor dot(const Tensor& x, const Tensor& y);

Tensor matmul(const Tensor& x,
const Tensor& y,
bool transpose_x,
bool transpose_y);

} // namespace experimental
} // namespace paddle
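A hypothetical call site for the newly declared high-level matmul, with the tensors x and y assumed to exist:

// Hypothetical usage of the declaration above.
// paddle::experimental::Tensor out =
//     paddle::experimental::matmul(x, y, /*transpose_x=*/false,
//                                  /*transpose_y=*/false);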

1 comment on commit dfb7f37


@paddle-bot-old (bot) commented on dfb7f37 Nov 2, 2021


🕵️ CI failures summary

🔍 PR: #36930 Commit ID: dfb7f37 contains failed CI.

🔹 Failed: PR-CI-Windows-OPENBLAS

test_failed
2021-11-02 15:50:00 The following tests FAILED:
2021-11-02 15:50:00 12 - test_fill_api (Failed)
2021-11-02 15:50:00 569 - test_gradient_clip (Failed)
2021-11-02 15:50:00 891 - test_scale_op (Failed)
2021-11-02 15:50:00 945 - test_sum_op (Failed)
2021-11-02 15:50:00 12 - test_fill_api (Failed)
2021-11-02 15:50:00 574 - test_gradient_clip (Failed)
2021-11-02 15:50:00 900 - test_scale_op (Failed)
2021-11-02 15:50:00 955 - test_sum_op (Failed)
2021-11-02 15:50:00 12 - test_fill_api (Failed)
2021-11-02 15:50:00 574 - test_gradient_clip (Failed)
2021-11-02 15:50:00 900 - test_scale_op (Failed)
2021-11-02 15:50:00 955 - test_sum_op (Failed)
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>goto:eof
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>for /F %# in ('wmic os get localdatetime|findstr 20') do set end=%#
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>set end=20211102155000.400000+480
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>set end=1102155000
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>call :timestamp "1102152938" "1102155000" "1 card TestCases Total"
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>setlocal enabledelayedexpansion
2021-11-02 15:50:00 228578
2021-11-02 15:50:00 "Windows 1 card TestCases Total Time: 1222s"
