Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into full_kernel

zyfncg committed Nov 2, 2021
2 parents 4d77b09 + 4a7f1a0 commit dfb7f37
Showing 35 changed files with 1,430 additions and 115 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -32,3 +32,6 @@ build_*
cmake-build-*
paddle/fluid/operators/distributed/send_recv.proto
model_test

Testing
tools/__pycache__
7 changes: 4 additions & 3 deletions paddle/fluid/framework/new_executor/interpretercore.cc
@@ -241,13 +241,14 @@ void InterpreterCore::BuildInplace() {
auto& outputs = instr.Outputs();
for (auto& pair : in_to_outs) {
auto iter = inputs.find(pair.first);
if (iter != inputs.end()) {
if (iter != inputs.end() && !iter->second.empty()) {
if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
auto iterout = outputs.find(pair.second);
if (iterout != outputs.end()) {
if (iterout != outputs.end() && !iterout->second.empty()) {
auto invar = global_scope_->Var(iter->second[0]);
auto outvar = global_scope_->Var(iterout->second[0]);
if (invar && outvar) {
if (invar && outvar && invar->IsType<LoDTensor>() &&
outvar->IsType<LoDTensor>()) {
instr.AddInplace(invar, outvar);
VLOG(3) << "inplace " << vec_instruction_[i].OpBase()->Type()
<< " " << global_scope_->GetNameById(iter->second[0])
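The three added conditions above all guard against dereferencing something that may not exist: a declared-but-unbound input or output slot (the empty() checks) and a variable that is not a LoDTensor. A self-contained sketch of the slot checks, with invented map contents for illustration:

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<std::string, std::vector<int>> inputs{{"X", {7}}};
  std::map<std::string, std::vector<int>> outputs{{"Out", {}}};  // unbound slot

  auto in = inputs.find("X");
  auto out = outputs.find("Out");
  // Without the !empty() checks, out->second[0] below would index out of range.
  if (in != inputs.end() && !in->second.empty() &&
      out != outputs.end() && !out->second.empty()) {
    std::cout << "inplace " << in->second[0] << " -> " << out->second[0] << "\n";
  } else {
    std::cout << "skip inplace: unbound slot\n";
  }
}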
23 changes: 18 additions & 5 deletions paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -142,8 +142,8 @@ void build_variable_scope(const framework::ProgramDesc& pdesc,
if (nullptr == var_scope->FindVar(var_name)) {
var_scope->AddVar(var_desc->Name(), var_desc);
} else {
auto* var_desc = var_scope->VarDesc(var_name);
if (nullptr == var_desc) {
auto* var_desc_tmp = var_scope->VarDesc(var_name);
if (nullptr == var_desc_tmp) {
VLOG(3) << "update var:" << var_name << " desc from nullptr into "
<< var_desc;
var_scope->VarMetaInfo(var_name).vardesc_ = var_desc;
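The rename above (var_desc to var_desc_tmp) fixes a shadowing bug: the inner declaration hid the outer var_desc, so the VLOG and the vardesc_ assignment saw the (null) inner pointer instead of the descriptor being installed. A distilled sketch of the hazard:

#include <iostream>

int main() {
  const char* var_desc = "real-descriptor";  // outer, from the enclosing loop
  const char* var_desc_tmp = nullptr;        // was also named var_desc before
  if (var_desc_tmp == nullptr) {
    // With the old shadowing name, this branch printed and assigned the null
    // inner pointer; after the rename it correctly uses the outer var_desc.
    std::cout << "update desc from nullptr into " << var_desc << "\n";
  }
}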
@@ -206,9 +206,22 @@ void apply_device_guard(const OperatorBase* op_base,
VLOG(3) << "Switch into CPUPlace by device_guard.";
expected_kernel_key->place_ = platform::CPUPlace();
} else if (op_device.find("gpu") != std::string::npos &&
platform::is_gpu_place(place)) {
VLOG(3) << "Switch into " << place << " by device_guard.";
expected_kernel_key->place_ = place;
(platform::is_gpu_place(place) ||
platform::is_npu_place(place))) {
// When an Op that only has a CPUKernel is assigned to GPU or NPU, the
// CPUKernel will be executed and a warning will be given at the same time.
if (op_base->SupportGPU()) {
expected_kernel_key->place_ = place;
} else if (op_base->SupportNPU()) {
expected_kernel_key->place_ = place;
} else {
expected_kernel_key->place_ = platform::CPUPlace();
LOG_FIRST_N(WARNING, 1)
<< "Op(" << op_base->Type()
<< ") has no CUDA implementation. It will be assigned to CPUPlace.";
}
VLOG(3) << "Switch into " << expected_kernel_key->place_
<< " by device_guard.";
} else {
PADDLE_THROW(
platform::errors::Fatal("Unsupported current place %s", op_device));
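The rewritten branch above keeps the requested place when the op has a kernel for it and otherwise falls back to CPU with a one-time warning. A distilled sketch of that decision; the enum and function names are stand-ins, not Paddle's types:

#include <iostream>

enum class Place { CPU, GPU, NPU };

// Stand-in for the expected_kernel_key->place_ selection above.
Place ApplyDeviceGuard(Place requested, bool supports_gpu, bool supports_npu) {
  if (supports_gpu) {
    return requested;  // op has a GPU kernel: honor device_guard
  } else if (supports_npu) {
    return requested;  // op has an NPU kernel: honor device_guard
  }
  // Mirrors LOG_FIRST_N(WARNING, 1): the CPU kernel runs, the user is told once.
  std::cerr << "Op has no device implementation; assigned to CPUPlace.\n";
  return Place::CPU;
}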
9 changes: 9 additions & 0 deletions paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -474,6 +474,15 @@ struct VariableMetaInfo {
// TODO(zhiqiu): Maybe we need to add rwlock for VariableScope?
class VariableScope : public ScopeBase {
public:
VariableScope() {
// for @EMPTY@ variable
var_list_.push_back(nullptr);
name2id_[kEmptyVarName] = 0;
VariableMetaInfo info;
info.var_ref_count_ = 0;
info.vardesc_ = nullptr;
vec_meta_info_.push_back(info);
}
Variable* FindVar(const std::string& name) const {
auto it = name2id_.find(name);
if (it != name2id_.end()) {
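The new constructor reserves slot 0 of both tables for kEmptyVarName, so id 0 is never handed to a real variable. A minimal stand-in sketch of that reservation, with simplified types:

#include <map>
#include <string>
#include <vector>

struct Variable {};

class ScopeSketch {
 public:
  ScopeSketch() {
    // Mirror the constructor above: index 0 is the @EMPTY@ placeholder.
    var_list_.push_back(nullptr);
    name2id_["@EMPTY@"] = 0;
    ref_counts_.push_back(0);
  }
  Variable* FindVar(const std::string& name) const {
    auto it = name2id_.find(name);
    return it == name2id_.end() ? nullptr : var_list_[it->second];
  }

 private:
  std::vector<Variable*> var_list_;
  std::map<std::string, int> name2id_;
  std::vector<int> ref_counts_;
};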
2 changes: 1 addition & 1 deletion paddle/fluid/operators/CMakeLists.txt
@@ -99,7 +99,7 @@ if (WITH_GPU OR WITH_ROCM)
endif()
op_library(sync_batch_norm_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n")
if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) )
if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) )
op_library(sparse_attention_op)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n")
endif()
28 changes: 26 additions & 2 deletions paddle/fluid/operators/controlflow/fetch_v2_op.cc
@@ -77,12 +77,35 @@ class FetchV2Op : public framework::OperatorWithKernel {
framework::OpKernelType GetKernelTypeForVar(
const std::string &var_name, const framework::Tensor &tensor,
const framework::OpKernelType &expected_kernel_type) const override {
if (!tensor.IsInitialized()) {
return expected_kernel_type;
}
return framework::OpKernelType(expected_kernel_type.data_type_,
tensor.place(), tensor.layout());
}

framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
auto *fetch_var = ctx.InputVar("X");
if (fetch_var == nullptr) {
return framework::OpKernelType(framework::proto::VarType::FP32,
platform::CPUPlace());
}

if (fetch_var->IsType<framework::LoDTensor>()) {
auto &src_item = fetch_var->Get<framework::LoDTensor>();
if (!src_item.IsInitialized()) {
return framework::OpKernelType(framework::proto::VarType::FP32,
platform::CPUPlace());
}
} else {
auto &src_item = fetch_var->Get<framework::LoDTensorArray>();
if (src_item.empty() || !src_item[0].IsInitialized()) {
return framework::OpKernelType(framework::proto::VarType::FP32,
platform::CPUPlace());
}
}

return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"),
platform::CPUPlace());
@@ -127,6 +150,9 @@ class FetchV2Kernel {

if (fetch_var->IsType<framework::LoDTensor>()) {
auto &src_item = fetch_var->Get<framework::LoDTensor>();
if (!src_item.IsInitialized()) {
return;
}
auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col)));
bool check_place = platform::is_cpu_place(src_item.place()) ||
platform::is_cuda_pinned_place(src_item.place());
@@ -173,9 +199,7 @@ class FetchV2OpProtoMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(true);
AddComment(R"DOC(
FetchV2 Operator.
It should not be configured by users directly.
)DOC");
}
};
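Both FetchV2 changes above implement the same defensive rule: an uninitialized LoDTensor (or an empty LoDTensorArray) cannot report a data type, so GetExpectedKernelType falls back to a default kernel key (FP32 on CPU) and the kernel returns early. A distilled sketch with stand-in types, names invented for illustration:

#include <optional>

enum class DType { FP32, FP64 };
struct KernelKey { DType dtype; };

struct TensorStub {
  std::optional<DType> dtype;  // empty means uninitialized
  bool IsInitialized() const { return dtype.has_value(); }
};

KernelKey ExpectedKernelKey(const TensorStub& t) {
  if (!t.IsInitialized()) {
    return KernelKey{DType::FP32};  // safe default, as in the op above
  }
  return KernelKey{*t.dtype};
}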
25 changes: 16 additions & 9 deletions paddle/fluid/operators/matmul_v2_op.h
@@ -25,6 +25,11 @@ limitations under the License. */
#include "paddle/fluid/operators/math/complex_functors.h"
#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"

// can only include headers from the paddle/pten/api dirs
#include "paddle/pten/api/include/core.h"
#include "paddle/pten/api/include/linalg.h"
#include "paddle/pten/hapi/lib/utils/tensor_utils.h"

#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
#endif
@@ -380,15 +385,17 @@ class MatMulV2Kernel : public framework::OpKernel<T> {
auto* Out = ctx.Output<Tensor>("Out");
bool trans_x = ctx.Attr<bool>("trans_x");
bool trans_y = ctx.Attr<bool>("trans_y");
PADDLE_ENFORCE_NE(framework::product(X->dims()), 0,
platform::errors::InvalidArgument(
"The Input(X) dims size must not be equal to 0,"
" but received dims size is 0. "));
PADDLE_ENFORCE_NE(framework::product(Y->dims()), 0,
platform::errors::InvalidArgument(
"The Input(Y) dims size must not be equal to 0,"
" but received dims size is 0. "));
MatMulFunction<DeviceContext, T>(X, Y, Out, trans_x, trans_y, ctx);

auto& dev_ctx = ctx.device_context<DeviceContext>();
Out->mutable_data<T>(X->place());

auto pt_x = paddle::experimental::MakePtenDenseTensor(*X);
auto pt_y = paddle::experimental::MakePtenDenseTensor(*Y);
auto pt_out = paddle::experimental::MakePtenDenseTensor(*Out);

// call new kernel
pten::Matmul<T>(dev_ctx, *pt_x.get(), *pt_y.get(), trans_x, trans_y,
pt_out.get());
}
};

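The new kernel body above follows an adapter pattern: allocate the fluid output, wrap the fluid tensors as pten DenseTensors via MakePtenDenseTensor, and forward to pten::Matmul. A minimal stand-in sketch of that flow; the types and names below are invented for illustration, not Paddle's real API:

#include <memory>
#include <vector>

struct FluidTensor { std::vector<float> data; };
struct PtenView { std::vector<float>* data; };  // non-owning wrapper

std::unique_ptr<PtenView> MakeView(FluidTensor& t) {
  return std::unique_ptr<PtenView>(new PtenView{&t.data});
}

void PtenMatmul(const PtenView& x, const PtenView& y, bool trans_x,
                bool trans_y, PtenView* out) {
  // Stand-in for the real pten::Matmul kernel.
}

void MatmulCompute(FluidTensor& x, FluidTensor& y, FluidTensor* out) {
  out->data.resize(x.data.size());  // mirrors Out->mutable_data<T>(X->place())
  auto px = MakeView(x);            // mirrors MakePtenDenseTensor(*X)
  auto py = MakeView(y);
  auto pout = MakeView(*out);
  PtenMatmul(*px, *py, /*trans_x=*/false, /*trans_y=*/false, pout.get());
}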
4 changes: 2 additions & 2 deletions paddle/fluid/platform/dynload/cusparse.h
@@ -56,8 +56,8 @@ extern void *cusparse_dso_handle;

CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);

// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
__macro(cusparseSDDMM_preprocess); \
19 changes: 19 additions & 0 deletions paddle/fluid/pybind/pybind.cc
@@ -226,6 +226,23 @@ bool SupportsBfloat16FastPerformance() {
#endif
}

bool SupportsInt8() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
return (platform::MayIUse(platform::cpu_isa_t::avx2) ||
platform::MayIUse(platform::cpu_isa_t::avx512f));
#endif
}

bool SupportsVNNI() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni);
#endif
}

// According to the input `place` and `dtype`, this function returns a tuple
// consisting of three sets:
// 1) All operators registered in the Paddle framework.
@@ -2121,6 +2138,8 @@ All parameter, weight, gradient are variables in Paddle.
m.def("_is_compiled_with_heterps", IsCompiledWithHETERPS);
m.def("supports_bfloat16", SupportsBfloat16);
m.def("supports_bfloat16_fast_performance", SupportsBfloat16FastPerformance);
m.def("supports_int8", SupportsInt8);
m.def("supports_vnni", SupportsVNNI);
m.def("op_supported_infos", OpSupportedInfos);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
m.def("is_compiled_with_dist", IsCompiledWithDIST);
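The two probes added above combine a compile-time build flag with a runtime ISA query. A distilled sketch of the same shape, with a stubbed-out probe standing in for platform::MayIUse (the stub is an assumption for illustration):

#include <string>

// Stub for platform::MayIUse; the real one queries the CPU at runtime.
bool MayIUse(const std::string& isa) { return isa == "avx2"; }

bool SupportsInt8() {
#ifndef PADDLE_WITH_MKLDNN
  return false;
#else
  return MayIUse("avx2") || MayIUse("avx512f");
#endif
}

bool SupportsVNNI() {
#ifndef PADDLE_WITH_MKLDNN
  return false;
#else
  return MayIUse("avx512_core_vnni");
#endif
}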
21 changes: 21 additions & 0 deletions paddle/pten/api/include/creation.h
@@ -14,5 +14,26 @@

#pragma once

#include "paddle/pten/api/include/infershape.h"
#include "paddle/pten/hapi/lib/utils/allocator.h"
#include "paddle/pten/kernels/cpu/creation.h"
#include "paddle/pten/kernels/cuda/creation.h"

namespace pten {

// TODO(YuanRisheng) This function name should be the same as the user API name.
// TODO(zyfncg) Automatic code generation
template <typename T, typename ContextT>
DenseTensor FillAnyLike(const ContextT& dev_ctx,
const DenseTensor& x,
const Scalar& val) {
auto out_meta = UnchangedInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
FillAnyLike<T>(dev_ctx, x, val, &dense_out);
return dense_out;
}

} // namespace pten
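FillAnyLike above, and the Dot/Flatten/Sign/Mean/Scale wrappers in the headers that follow, all share one shape: infer the output meta from the inputs, allocate a DenseTensor through the device allocator, run the in-place kernel into it, and return by value. A minimal sketch of that pattern with stand-in types, not Paddle's real ones:

#include <utility>
#include <vector>

struct Meta { std::vector<int> dims; };

struct DenseTensor {
  explicit DenseTensor(Meta m) : meta(std::move(m)) {}
  Meta meta;
  std::vector<float> data;
};

// Same-shape ops reuse the input meta, like UnchangedInferShape above.
Meta UnchangedInferShape(const Meta& in) { return in; }

// Stand-in for the low-level kernel that writes into a preallocated output.
void SignKernel(const DenseTensor& x, DenseTensor* out) {}

DenseTensor Sign(const DenseTensor& x) {
  DenseTensor out(UnchangedInferShape(x.meta));  // 1) infer output meta
  SignKernel(x, &out);                           // 2) run kernel in place
  return out;                                    // 3) return by value
}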
19 changes: 19 additions & 0 deletions paddle/pten/api/include/linalg.h
@@ -15,5 +15,24 @@
#pragma once

// See Note: [ How do we organize the kernel directory ]
#include "paddle/pten/api/include/infershape.h"
#include "paddle/pten/hapi/lib/utils/allocator.h"
#include "paddle/pten/kernels/cpu/linalg.h"
#include "paddle/pten/kernels/cuda/linalg.h"

namespace pten {

template <typename T, typename ContextT>
DenseTensor Dot(const ContextT& dev_ctx,
const DenseTensor& x,
const DenseTensor& y) {
auto out_meta = DotInferShape(x.meta(), y.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Dot<T>(dev_ctx, x, y, &dense_out);
return dense_out;
}

} // namespace pten
20 changes: 20 additions & 0 deletions paddle/pten/api/include/manipulation.h
@@ -15,5 +15,25 @@
#pragma once

// See Note: [ How do we organize the kernel directory ]
#include "paddle/pten/api/include/infershape.h"
#include "paddle/pten/hapi/lib/utils/allocator.h"
#include "paddle/pten/kernels/cpu/manipulation.h"
#include "paddle/pten/kernels/cuda/manipulation.h"

namespace pten {

template <typename T, typename ContextT>
DenseTensor Flatten(const ContextT& dev_ctx,
const DenseTensor& x,
int start_axis,
int stop_axis) {
auto out_meta = FlattenInferShape(x.meta(), start_axis, stop_axis);
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Flatten<T>(dev_ctx, x, start_axis, stop_axis, &dense_out);
return dense_out;
}

} // namespace pten
57 changes: 57 additions & 0 deletions paddle/pten/api/include/math.h
@@ -15,5 +15,62 @@ limitations under the License. */
#pragma once

// See Note: [ How do we organize the kernel directory ]
#include "paddle/pten/api/include/infershape.h"
#include "paddle/pten/hapi/lib/utils/allocator.h"
#include "paddle/pten/kernels/cpu/math.h"
#include "paddle/pten/kernels/cuda/math.h"

namespace pten {

template <typename T, typename ContextT>
DenseTensor Sign(const ContextT& dev_ctx, const DenseTensor& x) {
auto out_meta = UnchangedInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Sign<T>(dev_ctx, x, &dense_out);
return dense_out;
}

template <typename T, typename ContextT>
DenseTensor Mean(const ContextT& dev_ctx, const DenseTensor& x) {
auto out_meta = ReductionInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Mean<T>(dev_ctx, x, &dense_out);
return dense_out;
}

template <typename T, typename ContextT>
DenseTensor Scale(const ContextT& dev_ctx,
const DenseTensor& x,
float scale,
float bias,
bool bias_after_scale) {
auto out_meta = UnchangedInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
Scale<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
return dense_out;
}

template <typename T, typename ContextT>
DenseTensor Scale(const ContextT& dev_ctx,
const DenseTensor& x,
const DenseTensor& scale,
float bias,
bool bias_after_scale) {
auto out_meta = UnchangedInferShape(x.meta());
const auto allocator =
std::make_shared<paddle::experimental::DefaultAllocator>(
dev_ctx.GetPlace());
pten::DenseTensor dense_out(allocator, out_meta);
ScaleHost<T>(dev_ctx, x, scale, bias, bias_after_scale, &dense_out);
return dense_out;
}
} // namespace pten
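Taken together with the creation, linalg, and manipulation headers, this gives a small functional surface. A hypothetical call site, where the device-context variable and its construction are assumptions not shown in this commit:

// Hypothetical usage; dev_ctx is an assumed CPU device context.
// auto s = pten::Sign<float>(dev_ctx, x);
// auto m = pten::Mean<float>(dev_ctx, x);
// auto y = pten::Scale<float>(dev_ctx, x, /*scale=*/2.f, /*bias=*/0.f,
//                             /*bias_after_scale=*/true);
// The second Scale overload takes the scale as a DenseTensor and dispatches
// to ScaleHost<T> instead.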
5 changes: 5 additions & 0 deletions paddle/pten/hapi/include/linalg.h
@@ -21,5 +21,10 @@ namespace experimental {

Tensor dot(const Tensor& x, const Tensor& y);

Tensor matmul(const Tensor& x,
const Tensor& y,
bool transpose_x,
bool transpose_y);

} // namespace experimental
} // namespace paddle
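A hypothetical call site for the newly declared high-level matmul, with the tensors x and y assumed to exist:

// Hypothetical usage of the declaration above.
// paddle::experimental::Tensor out =
//     paddle::experimental::matmul(x, y, /*transpose_x=*/false,
//                                  /*transpose_y=*/false);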

1 comment on commit dfb7f37


@paddle-bot-old (bot) commented on dfb7f37 Nov 2, 2021


🕵️ CI failures summary

🔍 PR: #36930 Commit ID: dfb7f37 contains failed CI.

🔹 Failed: PR-CI-Windows-OPENBLAS

test_failed
2021-11-02 15:50:00 The following tests FAILED:
2021-11-02 15:50:00 12 - test_fill_api (Failed)
2021-11-02 15:50:00 569 - test_gradient_clip (Failed)
2021-11-02 15:50:00 891 - test_scale_op (Failed)
2021-11-02 15:50:00 945 - test_sum_op (Failed)
2021-11-02 15:50:00 12 - test_fill_api (Failed)
2021-11-02 15:50:00 574 - test_gradient_clip (Failed)
2021-11-02 15:50:00 900 - test_scale_op (Failed)
2021-11-02 15:50:00 955 - test_sum_op (Failed)
2021-11-02 15:50:00 12 - test_fill_api (Failed)
2021-11-02 15:50:00 574 - test_gradient_clip (Failed)
2021-11-02 15:50:00 900 - test_scale_op (Failed)
2021-11-02 15:50:00 955 - test_sum_op (Failed)
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>goto:eof
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>for /F %# in ('wmic os get localdatetime|findstr 20') do set end=%#
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>set end=20211102155000.400000+480
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>set end=1102155000
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>call :timestamp "1102152938" "1102155000" "1 card TestCases Total"
2021-11-02 15:50:00 C:\home\workspace\Paddle\build>setlocal enabledelayedexpansion
2021-11-02 15:50:00 228578
2021-11-02 15:50:00 "Windows 1 card TestCases Total Time: 1222s"
