Refine device context (#6433)
The main changes are as follows:

- take `DeviceContext` instead of `Place` as the template parameter of math functors and `OpKernel`
- remove `eigen_device` interface in base class  `DeviceContext`
- remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext`
- remove unused `platform::EigenDeviceConverter`
- rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL`
- rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP`
QiJune authored Dec 12, 2017
1 parent 7902ad6 commit 61ec0b9
Showing 319 changed files with 2,624 additions and 2,546 deletions.
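Before the per-file diffs, a minimal sketch of the kernel pattern this commit moves to. The `scale` op and `ScaleKernel` below are hypothetical names for illustration, not files in this commit; the real kernels in the diffs follow the same shape.

// Sketch only: kernels are now templated on a concrete DeviceContext
// rather than a Place, and obtain the Eigen device from that context.
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ScaleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    auto x = framework::EigenVector<T>::Flatten(*in);
    auto y = framework::EigenVector<T>::Flatten(*out);
    // CPUDeviceContext and CUDADeviceContext still expose eigen_device();
    // only the base-class GetEigenDevice/eigen_device accessors are gone.
    auto& place =
        *ctx.template device_context<DeviceContext>().eigen_device();
    y.device(place) = x * static_cast<T>(ctx.Attr<float>("scale"));
  }
};

}  // namespace operators
}  // namespace paddle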
12 changes: 6 additions & 6 deletions paddle/framework/op_registry.h
@@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar {
return 0; \
}

-#define REGISTER_OP_GPU_KERNEL(op_type, ...) \
-REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
+#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
+REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)

#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar {
#else
#define USE_OP_KERNEL(op_type) \
USE_OP_DEVICE_KERNEL(op_type, CPU); \
-USE_OP_DEVICE_KERNEL(op_type, GPU)
+USE_OP_DEVICE_KERNEL(op_type, CUDA)
#endif

#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
@@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar {
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CPU);

-#define USE_GPU_ONLY_OP(op_type) \
-USE_OP_ITSELF(op_type); \
-USE_OP_DEVICE_KERNEL(op_type, GPU)
+#define USE_CUDA_ONLY_OP(op_type) \
+USE_OP_ITSELF(op_type); \
+USE_OP_DEVICE_KERNEL(op_type, CUDA)

#define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \
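A hedged usage sketch of the renamed registration macros, reusing the hypothetical `ScaleKernel` from the sketch above (not code from this commit):

// CPU registration keeps its macro; kernels are parameterized by the
// concrete DeviceContext. In the corresponding .cu file, the CUDA-named
// macro replaces the old GPU one.
namespace ops = paddle::operators;

REGISTER_OP_CPU_KERNEL(
    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>);

// In scale_op.cu:
REGISTER_OP_CUDA_KERNEL(
    scale, ops::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ScaleKernel<paddle::platform::CUDADeviceContext, double>);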
16 changes: 1 addition & 15 deletions paddle/framework/operator.cc
@@ -22,20 +22,6 @@ limitations under the License. */
namespace paddle {
namespace framework {

-template <>
-Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
-platform::CPUPlace, Eigen::DefaultDevice>() const {
-return *device_context_.GetEigenDevice<platform::CPUPlace>();
-}
-
-#ifdef PADDLE_WITH_CUDA
-template <>
-Eigen::GpuDevice&
-ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-return *device_context_.GetEigenDevice<platform::GPUPlace>();
-}
-#endif
-
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope,
}
OpKernelType OperatorWithKernel::GetKernelType(
const ExecutionContext& ctx) const {
-return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
}
DataType OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
26 changes: 13 additions & 13 deletions paddle/framework/operator.h
@@ -276,17 +276,25 @@ class ExecutionContext {
out_tensor->set_lod(in_tensor.lod());
}

-template <typename PlaceType,
-typename DeviceType = typename platform::EigenDeviceConverter<
-PlaceType>::EigenDeviceType>
-DeviceType& GetEigenDevice() const;
-
platform::Place GetPlace() const { return device_context_.GetPlace(); }

+template <typename DeviceContextType>
+const DeviceContextType& device_context() const {
+return *reinterpret_cast<const DeviceContextType*>(&device_context_);
+}
+
const platform::DeviceContext& device_context() const {
return device_context_;
}

+#ifdef PADDLE_WITH_CUDA
+const inline platform::CUDADeviceContext& cuda_device_context() const {
+PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
+return *reinterpret_cast<const platform::CUDADeviceContext*>(
+&device_context_);
+}
+#endif
+
//! Get actual name vector for this input.
const std::vector<std::string>& Inputs(const std::string& name) const {
return op_.Inputs(name);
@@ -297,14 +305,6 @@ class ExecutionContext {
return op_.Outputs(name);
}

-#ifdef PADDLE_WITH_CUDA
-const inline platform::CUDADeviceContext& cuda_device_context() const {
-PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
-return *reinterpret_cast<const platform::CUDADeviceContext*>(
-&device_context_);
-}
-#endif
-
private:
const OperatorBase& op_;
const Scope& scope_;
2 changes: 1 addition & 1 deletion paddle/framework/operator_test.cc
@@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
-return OpKernelType(DataType::FP32, ctx.device_context());
+return OpKernelType(DataType::FP32, ctx.GetPlace());
}
};

2 changes: 1 addition & 1 deletion paddle/operators/CMakeLists.txt
@@ -138,7 +138,7 @@ function(op_library TARGET)
if ("${TARGET}" STREQUAL "nccl_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
-file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
+file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
endif()

# reduce_op contains several operators
2 changes: 1 addition & 1 deletion paddle/operators/accuracy_op.cc
@@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
-ctx.device_context());
+ctx.GetPlace());
}
};

5 changes: 3 additions & 2 deletions paddle/operators/accuracy_op.cu
@@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {

// FIXME(typhoonzero): types of T is for inference data.
// label data is always int64
-REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
-paddle::operators::AccuracyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(accuracy,
+paddle::operators::AccuracyOpCUDAKernel<float>,
+paddle::operators::AccuracyOpCUDAKernel<double>);
2 changes: 1 addition & 1 deletion paddle/operators/accuracy_op.h
@@ -21,7 +21,7 @@ namespace operators {

using Tensor = framework::Tensor;

-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
class AccuracyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
21 changes: 11 additions & 10 deletions paddle/operators/activation_op.cc
@@ -611,16 +611,17 @@ REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
ops::ActivationOpGrad);

-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
-REGISTER_OP_CPU_KERNEL( \
-act_type, \
-ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
-ops::ActivationKernel<paddle::platform::CPUPlace, \
-ops::functor<double>>); \
-REGISTER_OP_CPU_KERNEL( \
-act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, \
-ops::grad_functor<float>>, \
-ops::ActivationGradKernel<paddle::platform::CPUPlace, \
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
+REGISTER_OP_CPU_KERNEL( \
+act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
+ops::functor<float>>, \
+ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
+ops::functor<double>>); \
+REGISTER_OP_CPU_KERNEL( \
+act_type##_grad, \
+ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
+ops::grad_functor<float>>, \
+ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<double>>);

FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
23 changes: 12 additions & 11 deletions paddle/operators/activation_op.cu
@@ -17,16 +17,17 @@

namespace ops = paddle::operators;

-#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \
-REGISTER_OP_GPU_KERNEL( \
-act_type, \
-ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
-ops::ActivationKernel<paddle::platform::GPUPlace, \
-ops::functor<double>>); \
-REGISTER_OP_GPU_KERNEL( \
-act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, \
-ops::grad_functor<float>>, \
-ops::ActivationGradKernel<paddle::platform::GPUPlace, \
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
+REGISTER_OP_CUDA_KERNEL( \
+act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
+ops::functor<float>>, \
+ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
+ops::functor<double>>); \
+REGISTER_OP_CUDA_KERNEL( \
+act_type##_grad, \
+ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \
+ops::grad_functor<float>>, \
+ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \
ops::grad_functor<double>>);

-FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
14 changes: 8 additions & 6 deletions paddle/operators/activation_op.h
@@ -19,7 +19,7 @@
namespace paddle {
namespace operators {

-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
class ActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
@@ -32,18 +32,19 @@ class ActivationKernel

auto x = framework::EigenVector<T>::Flatten(*X);
auto y = framework::EigenVector<T>::Flatten(*Y);
-auto place = context.GetEigenDevice<Place>();
+auto* place =
+context.template device_context<DeviceContext>().eigen_device();
Functor functor;

auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
-functor(place, x, y);
+functor(*place, x, y);
}
};

-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
class ActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
@@ -59,13 +60,14 @@ class ActivationGradKernel
auto x = framework::EigenVector<T>::Flatten(*X);
auto y = framework::EigenVector<T>::Flatten(*Y);
auto dx = framework::EigenVector<T>::Flatten(*dX);
-auto place = context.GetEigenDevice<Place>();
+auto* place =
+context.template device_context<DeviceContext>().eigen_device();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
-functor(place, x, y, dy, dx);
+functor(*place, x, y, dy, dx);
}
};

4 changes: 2 additions & 2 deletions paddle/operators/adadelta_op.cc
@@ -109,5 +109,5 @@ param\_out = param + param\_update
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
REGISTER_OP_CPU_KERNEL(
-adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
-ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
+adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
+ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);
6 changes: 3 additions & 3 deletions paddle/operators/adadelta_op.cu
@@ -16,6 +16,6 @@
#include "paddle/operators/adadelta_op.h"

namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
-ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
+ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);
4 changes: 2 additions & 2 deletions paddle/operators/adadelta_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {

-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
class AdadeltaOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
auto avg_squared_update_out =
framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
-auto place = ctx.GetEigenDevice<Place>();
+auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

avg_squared_grad_out.device(place) =
rho * avg_squared_grad + (1 - rho) * grad.square();
18 changes: 9 additions & 9 deletions paddle/operators/adagrad_op.cc
@@ -100,8 +100,8 @@ size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
} // namespace

template <typename T>
-struct SparseAdagradFunctor<platform::CPUPlace, T> {
-void operator()(const platform::DeviceContext& context,
+struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
+void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
@@ -120,7 +120,7 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());

-math::SetConstant<platform::CPUPlace, T> constant_functor;
+math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);

auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@@ -144,9 +144,9 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
+gs.device(*context.eigen_device()) = gm * gm;

-math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
+math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
functor(context, *grad_square, moment);

// 3. update parameter
@@ -164,13 +164,13 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
}
};

-template struct SparseAdagradFunctor<platform::CPUPlace, float>;
-template struct SparseAdagradFunctor<platform::CPUPlace, double>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
REGISTER_OP_CPU_KERNEL(
-adagrad, ops::AdagradOpKernel<paddle::platform::CPUPlace, float>,
-ops::AdagradOpKernel<paddle::platform::CPUPlace, double>);
+adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
+ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);
20 changes: 10 additions & 10 deletions paddle/operators/adagrad_op.cu
@@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
} // namespace

template <typename T>
-struct SparseAdagradFunctor<platform::GPUPlace, T> {
-void operator()(const platform::DeviceContext& context,
+struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
+void operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
@@ -92,7 +92,7 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());

-math::SetConstant<platform::GPUPlace, T> constant_functor;
+math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);

auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@@ -119,9 +119,9 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-gs.device(*context.GetEigenDevice<platform::GPUPlace>()) = gm * gm;
+gs.device(*context.eigen_device()) = gm * gm;

-math::SelectedRowsAddToTensor<platform::GPUPlace, T> functor;
+math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
functor(context, *grad_square, moment);

// 3. update parameter
@@ -139,13 +139,13 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
}
};

-template struct SparseAdagradFunctor<platform::GPUPlace, float>;
-template struct SparseAdagradFunctor<platform::GPUPlace, double>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-adagrad, ops::AdagradOpKernel<paddle::platform::GPUPlace, float>,
-ops::AdagradOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
+ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);
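The math functors follow the same convention. A sketch of the assumed shape, modeled on the `SetConstant` calls in the hunks above (the real definition lives under paddle/operators/math and may differ in detail):

// Functors take the concrete DeviceContext as a template parameter and
// receive it by const reference, replacing the old Place-based dispatch.
template <typename DeviceContext, typename T>
struct SetConstant {
  void operator()(const DeviceContext& context, framework::Tensor* tensor,
                  T value) {
    auto t = framework::EigenVector<T>::Flatten(*tensor);
    // Each concrete context exposes its own eigen_device().
    t.device(*context.eigen_device()) = t.constant(value);
  }
};

// Call sites pick the specialization through the context they already hold:
//   math::SetConstant<platform::CUDADeviceContext, T> set_zero;
//   set_zero(dev_ctx, output, static_cast<T>(0));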