
Commit 3a4ca51

Merge branch 'develop' into vs2019_part1

betterpig committed Jan 6, 2022
2 parents 7a921d6 + 89c0877
Showing 60 changed files with 1,346 additions and 963 deletions.
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
@@ -36,7 +36,7 @@ ENDIF()

if(NOT DEFINED XPU_BASE_URL)
SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211228")
SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220104")
else()
SET(XPU_BASE_URL "${XPU_BASE_URL}")
endif()
2 changes: 2 additions & 0 deletions paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -2441,11 +2441,13 @@ PDNode *patterns::Bfloat16Placement::operator()(
if (!bfloat16_enabled_op_types.empty()) {
supported_op_types = bfloat16_enabled_op_types;
}
auto *op_in = pattern->NewNode(op_in_repr())->AsInput();
auto *op = pattern->NewNode(op_repr())->assert_is_ops(supported_op_types);
op->assert_more([&](Node *node) {
return node->Op()->GetAttrIfExists<bool>("use_mkldnn") ||
node->Op()->Type() == "reshape2";
});
op->LinksFrom({op_in});
return op;
}

1 change: 1 addition & 0 deletions paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -1446,6 +1446,7 @@ struct Bfloat16Placement : public PatternBase {
PDNode* operator()(
const std::unordered_set<std::string>& bfloat16_enabled_op_types);

PATTERN_DECL_NODE(op_in);
PATTERN_DECL_NODE(op);
};

@@ -41,8 +41,12 @@ void CPUBfloat16PlacementPass::SetMkldnnDataType(

auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
Graph* g) {
GET_IR_NODE_FROM_SUBGRAPH(op_in, op_in, bfloat16_placement_pattern);
GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_placement_pattern);

// Only float input can be converted to bfloat16
if (op_in->Var()->GetDataType() != proto::VarType::FP32) return;

if ((op->Op()->HasAttr("mkldnn_data_type") ||
op->Op()->HasProtoAttr("mkldnn_data_type")) &&
!platform::HasOpINT8DataType(op->Op())) {
@@ -68,7 +68,7 @@ ProgramDesc BuildProgramDesc() {
for (auto& v :
std::vector<std::string>({"a", "b", "c", "f", "g", "h", "k", "l", "m",
"n", "o", "p", "r", "s"})) {
prog.MutableBlock(0)->Var(v);
prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32);
}

SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"});
@@ -86,9 +86,8 @@ }
}

void MainTest(std::initializer_list<std::string> bfloat16_enabled_op_types,
unsigned expected_bfloat16_data_type_count) {
auto prog = BuildProgramDesc();

unsigned expected_bfloat16_data_type_count,
const ProgramDesc& prog) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));

auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
@@ -110,8 +109,8 @@ void MainTest(std::initializer_list<std::string> bfloat16_enabled_op_types,
EXPECT_EQ(bfloat16_data_type_count, expected_bfloat16_data_type_count);
}

void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) {
auto prog = BuildProgramDesc();
void DefaultAttrTest(unsigned expected_bfloat16_data_type_count,
const ProgramDesc& prog) {
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto pass = PassRegistry::Instance().Get("cpu_bfloat16_placement_pass");
graph.reset(pass->Apply(graph.release()));
@@ -128,15 +127,39 @@ void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) {
}

TEST(Bfloat16PlacementPass, enable_all) {
MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8);
MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8,
BuildProgramDesc());
}

TEST(Bfloat16PlacementPass, enabled_conv_and_pool) {
// 2 conv2d + 2 pool2 - 1 orphaned conv2d
MainTest({"conv2d", "pool2d"}, 3);
MainTest({"conv2d", "pool2d"}, 3, BuildProgramDesc());
}

TEST(Bfloat16PlacementPass, default_attr_value) {
DefaultAttrTest(10, BuildProgramDesc());
}

ProgramDesc BuildProgramDescWithDataType() {
ProgramDesc prog;

for (auto& v : std::vector<std::string>({"a", "b", "c", "d", "e"})) {
if (v == "a") {
prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::INT32);
} else {
prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32);
}
}

SetOp(&prog, "conv2d", "conv1", {"a"}, {"b"});
SetOp(&prog, "pool2d", "pool1", {"b"}, {"c"});
SetOp(&prog, "concat", "concat1", {"c", "d"}, {"e"});
return prog;
}

TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(10); }
TEST(Bfloat16PlacementPass, check_data_types) {
DefaultAttrTest(2, BuildProgramDescWithDataType());
}

} // namespace ir
} // namespace framework
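The new check_data_types test expects a count of 2 because conv1 consumes the INT32 variable "a", while pool1 and concat1 read FP32 inputs. A minimal standalone sketch of that gating rule, using hypothetical types rather than the Paddle pass itself:

#include <cassert>
#include <string>
#include <vector>

// Illustrative model of the rule added in this commit: an op is marked for
// bfloat16 only if its input variable is FP32.
enum class DType { FP32, INT32 };

struct FakeOp {
  std::string name;
  DType input_dtype;
};

int CountBfloat16Ops(const std::vector<FakeOp>& ops) {
  int count = 0;
  for (const auto& op : ops) {
    if (op.input_dtype == DType::FP32) ++count;  // mirrors the FP32 guard
  }
  return count;
}

int main() {
  // Mirrors BuildProgramDescWithDataType(): "a" is INT32, so conv1 is skipped.
  std::vector<FakeOp> ops = {{"conv1", DType::INT32},
                             {"pool1", DType::FP32},
                             {"concat1", DType::FP32}};
  assert(CountBfloat16Ops(ops) == 2);  // matches DefaultAttrTest(2, ...)
  return 0;
}
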
1 change: 1 addition & 0 deletions paddle/fluid/inference/tensorrt/op_teller.cc
@@ -726,6 +726,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h"));
auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w"));
if (!(out_h > 0 && out_w > 0)) {
if (scale.size() < 2) return false;
if (scale[0] <= 0.f || scale[1] <= 0.f) {
VLOG(3) << "scale factor must be greater than 0 if out_h or out_w is "
"not set.";
33 changes: 15 additions & 18 deletions paddle/fluid/memory/stream_safe_cuda_alloc_test.cu
@@ -85,7 +85,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
workspaces_.emplace_back(allocation);
}

result_ = AllocShared(place_, stream_num_ * workspace_size_);
result_ = Alloc(place_, stream_num_ * workspace_size_);
}

void SingleStreamRun(size_t idx) {
@@ -185,7 +185,7 @@ class StreamSafeCUDAAllocTest : public ::testing::Test {
platform::CUDAPlace place_;
std::vector<gpuStream_t> streams_;
std::vector<std::shared_ptr<Allocation>> workspaces_;
std::shared_ptr<Allocation> result_;
allocation::AllocationPtr result_;
};

TEST_F(StreamSafeCUDAAllocTest, CUDAMutilStreamTest) {
@@ -225,22 +225,23 @@ TEST(StreamSafeCUDAAllocInterfaceTest, AllocInterfaceTest) {

TEST(StreamSafeCUDAAllocInterfaceTest, GetAllocatorInterfaceTest) {
platform::CUDAPlace place = platform::CUDAPlace();
size_t alloc_size = 256;

allocation::AllocationPtr allocation_implicit_stream =
Alloc(place, alloc_size);
EXPECT_GE(allocation_implicit_stream->size(), alloc_size);
void *address = allocation_implicit_stream->ptr();
allocation_implicit_stream.reset();

auto &instance = allocation::AllocatorFacade::Instance();
const std::shared_ptr<Allocator> &allocator = instance.GetAllocator(place);

size_t alloc_size = 256;
std::shared_ptr<Allocation> allocation_from_allocator =
allocation::AllocationPtr allocation_from_allocator =
allocator->Allocate(alloc_size);
EXPECT_GE(allocation_from_allocator->size(), alloc_size);
void *address = allocation_from_allocator->ptr();
EXPECT_EQ(allocation_from_allocator->ptr(), address);
allocation_from_allocator.reset();

std::shared_ptr<Allocation> allocation_implicit_stream =
AllocShared(place, alloc_size);
EXPECT_GE(allocation_implicit_stream->size(), alloc_size);
EXPECT_EQ(allocation_implicit_stream->ptr(), address);
allocation_implicit_stream.reset();

Release(place);
CheckMemLeak(place);
}
@@ -347,16 +348,12 @@ TEST(StreamSafeCUDAAllocRetryTest, RetryTest) {
// so the second alloc will fail and retry
size_t alloc_size = available_size / 4 * 3;

std::shared_ptr<Allocation> allocation1 = AllocShared(
place, alloc_size,
platform::Stream(reinterpret_cast<platform::StreamId>(stream1)));
std::shared_ptr<Allocation> allocation2;
allocation::AllocationPtr allocation1 = Alloc(place, alloc_size, stream1);
allocation::AllocationPtr allocation2;

std::thread th([&allocation2, &place, &stream2, alloc_size]() {
std::this_thread::sleep_for(std::chrono::seconds(1));
allocation2 = AllocShared(
place, alloc_size,
platform::Stream(reinterpret_cast<platform::StreamId>(stream2)));
allocation2 = Alloc(place, alloc_size, stream2);
});
allocation1.reset(); // free but not release
th.join();
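The retry test sizes each request at 3/4 of the available memory, so two allocations cannot coexist and the second Alloc must wait until allocation1 is freed. A small arithmetic sketch of that sizing assumption (available_size here is an illustrative placeholder for the value the test queries at runtime):

#include <cassert>
#include <cstddef>

int main() {
  // Hypothetical amount of free GPU memory, standing in for the value the
  // test queries at runtime.
  const size_t available_size = 1 << 30;  // 1 GiB

  // Same sizing rule as the test: each request is 3/4 of what is available.
  const size_t alloc_size = available_size / 4 * 3;

  // Two such allocations together exceed the available memory, so the second
  // one can only succeed after the first is freed (hence the retry).
  assert(alloc_size * 2 > available_size);
  return 0;
}
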
4 changes: 2 additions & 2 deletions paddle/fluid/operators/dropout_impl.cu.h
@@ -34,7 +34,7 @@ limitations under the License. */
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise_no_broadcast.cu.h"
#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"

namespace paddle {
namespace operators {
@@ -193,7 +193,7 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx,
// VectorizedRandomGenerator use curand_uniform4, so we only support
// vec_size is 4;
int vec_size = (platform::GetVectorizedSize<T>(x_data) == 4) ? 4 : 1;
int block_size = pten::GetThreadsConfig(dev_ctx, x_numel, vec_size);
int block_size = pten::funcs::GetThreadsConfig(dev_ctx, x_numel, vec_size);
int grid_size =
((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;

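The grid size above is two chained ceiling divisions: elements are grouped into vec_size-wide work items, which are then packed into block_size-thread blocks. A quick standalone check with illustrative numbers (the block size of 256 is an assumption; in the kernel it comes from GetThreadsConfig):

#include <cassert>

int main() {
  const long long x_numel = 1000;  // illustrative element count
  const int vec_size = 4;          // curand_uniform4 path
  const int block_size = 256;      // assumed; GetThreadsConfig decides this

  // ceil(x_numel / vec_size) work items, then ceil(items / block_size) blocks.
  const long long grid_size =
      ((x_numel + vec_size - 1) / vec_size + block_size - 1) / block_size;

  assert((x_numel + vec_size - 1) / vec_size == 250);  // ceil(1000 / 4)
  assert(grid_size == 1);  // 250 work items fit in one 256-thread block
  return 0;
}
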
42 changes: 42 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_functor.h
@@ -236,5 +236,47 @@ struct FMinFunctor<int64_t> {
}
};

template <typename T>
struct MulGradFunctor {
inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
};
template <typename T>
struct MulGradFunctor<Complex<T>> {
inline HOSTDEVICE Complex<T> operator()(const Complex<T>& a,
const Complex<T>& b) const {
Complex<T> b_conj(b.real, -b.imag);
return a * b_conj;
}
};

template <typename InT, typename OutT>
struct MulGradXYFunctor {
inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(const InT& a,
const InT& b,
const InT& c) {
paddle::framework::Array<OutT, 2> outs;
// dx = dout * y
outs[0] = a * b;
// dy = dout * x
outs[1] = a * c;
return outs;
}
};

template <typename InT, typename OutT>
struct MulGradXYFunctor<Complex<InT>, Complex<OutT>> {
inline HOSTDEVICE paddle::framework::Array<Complex<OutT>, 2> operator()(
const Complex<InT>& a, const Complex<InT>& b, const Complex<InT>& c) {
paddle::framework::Array<Complex<OutT>, 2> outs;
// dx = dout * y
Complex<InT> b_conj(b.real, -b.imag);
outs[0] = a * b_conj;
// dy = dout * x
Complex<InT> c_conj(c.real, -c.imag);
outs[1] = a * c_conj;
return outs;
}
};

} // namespace operators
} // namespace paddle
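For complex inputs these functors multiply dout by the conjugate of the other operand, i.e. dx = dout * conj(y) and dy = dout * conj(x). A standalone sketch with std::complex that mirrors that arithmetic (illustrative only; it does not use the Paddle Complex type):

#include <cassert>
#include <complex>

int main() {
  using cf = std::complex<float>;

  const cf x(1.f, 2.f);
  const cf y(3.f, -1.f);
  const cf dout(0.5f, 0.25f);

  // Mirrors MulGradXYFunctor<Complex<T>, Complex<T>>: multiply dout by the
  // conjugate of the other operand.
  const cf dx = dout * std::conj(y);
  const cf dy = dout * std::conj(x);

  // Spot-check against the same products done by hand.
  // conj(y) = (3, 1):  dx = (0.5*3 - 0.25*1, 0.5*1 + 0.25*3) = (1.25, 1.25)
  assert(std::abs(dx - cf(1.25f, 1.25f)) < 1e-6f);
  // conj(x) = (1, -2): dy = (0.5*1 + 0.25*2, -0.5*2 + 0.25*1) = (1.0, -0.75)
  assert(std::abs(dy - cf(1.0f, -0.75f)) < 1e-6f);
  return 0;
}
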
95 changes: 34 additions & 61 deletions paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */

#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"

@@ -68,69 +69,41 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
}
};

template <typename T>
static __global__ void SimpleElemwiseMulGradCUDAKernel(const T* x, const T* y,
const T* out,
const T* dout,
int64_t size, T* dx,
T* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;

while (col < size) {
T o = dout[col];
dx[col] = y[col] * o;
dy[col] = x[col] * o;
col += blockDim.x * gridDim.x;
}
}

template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex<float>>(
const plat::complex<float>* x, const plat::complex<float>* y,
const plat::complex<float>* out, const plat::complex<float>* dout,
int64_t size, plat::complex<float>* dx, plat::complex<float>* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;

while (col < size) {
plat::complex<float> o = dout[col];
dx[col] = plat::complex<float>(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex<float>(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}

template <>
__global__ void SimpleElemwiseMulGradCUDAKernel<plat::complex<double>>(
const plat::complex<double>* x, const plat::complex<double>* y,
const plat::complex<double>* out, const plat::complex<double>* dout,
int64_t size, plat::complex<double>* dx, plat::complex<double>* dy) {
int col = blockIdx.x * blockDim.x + threadIdx.x;

while (col < size) {
plat::complex<double> o = dout[col];
dx[col] = plat::complex<double>(y[col].real, -y[col].imag) * o;
dy[col] = plat::complex<double>(x[col].real, -x[col].imag) * o;
col += blockDim.x * gridDim.x;
}
}

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_mul_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
auto size = x->numel();
dim3 grid_size =
dim3((size + ELEMENTWISE_BLOCK_SIZE - 1) / ELEMENTWISE_BLOCK_SIZE, 1);
SimpleElemwiseMulGradCUDAKernel<
T><<<grid_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
x->data<T>(), y->data<T>(), out->data<T>(), dout->data<T>(), size,
dx->mutable_data<T>(ctx.GetPlace()), dy->mutable_data<T>(ctx.GetPlace()));
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
ElementwiseMulGrad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out, const framework::Tensor* dout,
framework::Tensor* dx, framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis");
const auto& dev_ctx =
ctx.template device_context<platform::CUDADeviceContext>();
const auto place = ctx.GetPlace();

if (dx != nullptr && dy != nullptr) {
dx->mutable_data<T>(place);
if (dx->IsSharedBufferWith(*dout)) {
dx->clear();
dx->mutable_data<T>(x->dims(), place);
}
std::vector<const framework::Tensor*> ins = {dout, y, x};
GetGradXAndYOut<ElementwiseType::kBinary, T>(
dev_ctx, place, axis, ins, dout, dx, dy, MulGradXYFunctor<T, T>());
} else if (dx != nullptr && dy == nullptr) {
dx->mutable_data<T>(place);
if (dx->IsSharedBufferWith(*dout)) {
dx->clear();
dx->mutable_data<T>(x->dims(), place);
}
std::vector<const framework::Tensor*> ins = {dout, y};
GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, place, axis, ins, dout,
dx, MulGradFunctor<T>());
} else if (dx == nullptr && dy != nullptr) {
std::vector<const framework::Tensor*> ins = {dout, x};
GetGradXOrYOut<ElementwiseType::kBinary, T>(dev_ctx, place, axis, ins, dout,
dy, MulGradFunctor<T>());
}
}

} // namespace operators
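The rewritten ElementwiseMulGrad drops the hand-written grid-stride kernel in favor of functor-based kernels; when both gradients are needed, MulGradXYFunctor lets one pass over dout produce dx and dy together. A CPU-side sketch of that one-pass, two-output shape (plain C++ for illustration; the real dispatch happens inside GetGradXAndYOut on the GPU):

#include <array>
#include <cassert>
#include <vector>

// Illustrative functor: takes (dout, y, x) and returns {dx, dy} in one call,
// like MulGradXYFunctor does for the same-shape (no broadcast) case.
struct MulGradXY {
  std::array<float, 2> operator()(float dout, float y, float x) const {
    return {dout * y, dout * x};  // dx = dout * y, dy = dout * x
  }
};

int main() {
  const std::vector<float> x = {1.f, 2.f, 3.f};
  const std::vector<float> y = {4.f, 5.f, 6.f};
  const std::vector<float> dout = {1.f, 1.f, 0.5f};

  std::vector<float> dx(x.size()), dy(x.size());
  MulGradXY grad;
  for (size_t i = 0; i < x.size(); ++i) {
    const auto out = grad(dout[i], y[i], x[i]);  // one pass fills both outputs
    dx[i] = out[0];
    dy[i] = out[1];
  }

  assert(dx[2] == 3.f);   // 0.5 * 6
  assert(dy[2] == 1.5f);  // 0.5 * 3
  return 0;
}
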