Refine device context (#6433)
The main changes are as follows:

- take `DeviceContext` instead of `Place` as the template parameter of math functors and `OpKernel`
- remove `eigen_device` interface in base class  `DeviceContext`
- remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext`
- remove unused `platform::EigenDeviceConverter`
- rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL`
- rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP`
QiJune authored Dec 12, 2017
1 parent 7902ad6 commit 61ec0b9
Showing 319 changed files with 2,624 additions and 2,546 deletions.
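Before the per-file diffs, a minimal sketch of the kernel pattern this commit moves to. The `scale` op and `ScaleKernel` below are hypothetical names for illustration, not files in this commit; the real kernels in the diffs follow the same shape.

// Sketch only: kernels are now templated on a concrete DeviceContext
// rather than a Place, and obtain the Eigen device from that context.
#include "paddle/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ScaleKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* out = ctx.Output<framework::Tensor>("Out");
    out->mutable_data<T>(ctx.GetPlace());

    auto x = framework::EigenVector<T>::Flatten(*in);
    auto y = framework::EigenVector<T>::Flatten(*out);
    // CPUDeviceContext and CUDADeviceContext still expose eigen_device();
    // only the base-class GetEigenDevice/eigen_device accessors are gone.
    auto& place =
        *ctx.template device_context<DeviceContext>().eigen_device();
    y.device(place) = x * static_cast<T>(ctx.Attr<float>("scale"));
  }
};

}  // namespace operators
}  // namespace paddle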
12 changes: 6 additions & 6 deletions paddle/framework/op_registry.h
@@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar {
return 0; \
}

-#define REGISTER_OP_GPU_KERNEL(op_type, ...) \
-REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
+#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
+REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__)

#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
@@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar {
#else
#define USE_OP_KERNEL(op_type) \
USE_OP_DEVICE_KERNEL(op_type, CPU); \
-USE_OP_DEVICE_KERNEL(op_type, GPU)
+USE_OP_DEVICE_KERNEL(op_type, CUDA)
#endif

#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
@@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar {
USE_OP_ITSELF(op_type); \
USE_OP_DEVICE_KERNEL(op_type, CPU);

-#define USE_GPU_ONLY_OP(op_type) \
-USE_OP_ITSELF(op_type); \
-USE_OP_DEVICE_KERNEL(op_type, GPU)
+#define USE_CUDA_ONLY_OP(op_type) \
+USE_OP_ITSELF(op_type); \
+USE_OP_DEVICE_KERNEL(op_type, CUDA)

#define USE_OP(op_type) \
USE_OP_ITSELF(op_type); \
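A hedged usage sketch of the renamed registration macros, reusing the hypothetical `ScaleKernel` from the sketch above (not code from this commit):

// CPU registration keeps its macro; kernels are parameterized by the
// concrete DeviceContext. In the corresponding .cu file, the CUDA-named
// macro replaces the old GPU one.
namespace ops = paddle::operators;

REGISTER_OP_CPU_KERNEL(
    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>);

// In scale_op.cu:
REGISTER_OP_CUDA_KERNEL(
    scale, ops::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
    ops::ScaleKernel<paddle::platform::CUDADeviceContext, double>);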
16 changes: 1 addition & 15 deletions paddle/framework/operator.cc
@@ -22,20 +22,6 @@ limitations under the License. */
namespace paddle {
namespace framework {

-template <>
-Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
-platform::CPUPlace, Eigen::DefaultDevice>() const {
-return *device_context_.GetEigenDevice<platform::CPUPlace>();
-}
-
-#ifdef PADDLE_WITH_CUDA
-template <>
-Eigen::GpuDevice&
-ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-return *device_context_.GetEigenDevice<platform::GPUPlace>();
-}
-#endif
-
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope,
}
OpKernelType OperatorWithKernel::GetKernelType(
const ExecutionContext& ctx) const {
-return OpKernelType(IndicateDataType(ctx), ctx.device_context());
+return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
}
DataType OperatorWithKernel::IndicateDataType(
const ExecutionContext& ctx) const {
26 changes: 13 additions & 13 deletions paddle/framework/operator.h
@@ -276,17 +276,25 @@ class ExecutionContext {
out_tensor->set_lod(in_tensor.lod());
}

-template <typename PlaceType,
-typename DeviceType = typename platform::EigenDeviceConverter<
-PlaceType>::EigenDeviceType>
-DeviceType& GetEigenDevice() const;
-
platform::Place GetPlace() const { return device_context_.GetPlace(); }

+template <typename DeviceContextType>
+const DeviceContextType& device_context() const {
+return *reinterpret_cast<const DeviceContextType*>(&device_context_);
+}
+
const platform::DeviceContext& device_context() const {
return device_context_;
}

+#ifdef PADDLE_WITH_CUDA
+const inline platform::CUDADeviceContext& cuda_device_context() const {
+PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
+return *reinterpret_cast<const platform::CUDADeviceContext*>(
+&device_context_);
+}
+#endif
+
//! Get actual name vector for this input.
const std::vector<std::string>& Inputs(const std::string& name) const {
return op_.Inputs(name);
@@ -297,14 +305,6 @@ class ExecutionContext {
return op_.Outputs(name);
}

-#ifdef PADDLE_WITH_CUDA
-const inline platform::CUDADeviceContext& cuda_device_context() const {
-PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
-return *reinterpret_cast<const platform::CUDADeviceContext*>(
-&device_context_);
-}
-#endif
-
private:
const OperatorBase& op_;
const Scope& scope_;
2 changes: 1 addition & 1 deletion paddle/framework/operator_test.cc
@@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetKernelType(const ExecutionContext& ctx) const override {
-return OpKernelType(DataType::FP32, ctx.device_context());
+return OpKernelType(DataType::FP32, ctx.GetPlace());
}
};

2 changes: 1 addition & 1 deletion paddle/operators/CMakeLists.txt
@@ -138,7 +138,7 @@ function(op_library TARGET)
if ("${TARGET}" STREQUAL "nccl_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
-file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n")
+file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
endif()

# reduce_op contains several operators
2 changes: 1 addition & 1 deletion paddle/operators/accuracy_op.cc
@@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
-ctx.device_context());
+ctx.GetPlace());
}
};

5 changes: 3 additions & 2 deletions paddle/operators/accuracy_op.cu
@@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {

// FIXME(typhoonzero): types of T is for inference data.
// label data is always int64
-REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
-paddle::operators::AccuracyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(accuracy,
+paddle::operators::AccuracyOpCUDAKernel<float>,
+paddle::operators::AccuracyOpCUDAKernel<double>);
2 changes: 1 addition & 1 deletion paddle/operators/accuracy_op.h
@@ -21,7 +21,7 @@ namespace operators {

using Tensor = framework::Tensor;

-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
class AccuracyKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
21 changes: 11 additions & 10 deletions paddle/operators/activation_op.cc
@@ -611,16 +611,17 @@ REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
ops::ActivationOpGrad);

-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
-REGISTER_OP_CPU_KERNEL( \
-act_type, \
-ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>, \
-ops::ActivationKernel<paddle::platform::CPUPlace, \
-ops::functor<double>>); \
-REGISTER_OP_CPU_KERNEL( \
-act_type##_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, \
-ops::grad_functor<float>>, \
-ops::ActivationGradKernel<paddle::platform::CPUPlace, \
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \
+REGISTER_OP_CPU_KERNEL( \
+act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
+ops::functor<float>>, \
+ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
+ops::functor<double>>); \
+REGISTER_OP_CPU_KERNEL( \
+act_type##_grad, \
+ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
+ops::grad_functor<float>>, \
+ops::ActivationGradKernel<paddle::platform::CPUDeviceContext, \
ops::grad_functor<double>>);

FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
23 changes: 12 additions & 11 deletions paddle/operators/activation_op.cu
@@ -17,16 +17,17 @@

namespace ops = paddle::operators;

-#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \
-REGISTER_OP_GPU_KERNEL( \
-act_type, \
-ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>, \
-ops::ActivationKernel<paddle::platform::GPUPlace, \
-ops::functor<double>>); \
-REGISTER_OP_GPU_KERNEL( \
-act_type##_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, \
-ops::grad_functor<float>>, \
-ops::ActivationGradKernel<paddle::platform::GPUPlace, \
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor) \
+REGISTER_OP_CUDA_KERNEL( \
+act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
+ops::functor<float>>, \
+ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
+ops::functor<double>>); \
+REGISTER_OP_CUDA_KERNEL( \
+act_type##_grad, \
+ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \
+ops::grad_functor<float>>, \
+ops::ActivationGradKernel<paddle::platform::CUDADeviceContext, \
ops::grad_functor<double>>);

-FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
14 changes: 8 additions & 6 deletions paddle/operators/activation_op.h
@@ -19,7 +19,7 @@
namespace paddle {
namespace operators {

-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
class ActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
@@ -32,18 +32,19 @@ class ActivationKernel

auto x = framework::EigenVector<T>::Flatten(*X);
auto y = framework::EigenVector<T>::Flatten(*Y);
-auto place = context.GetEigenDevice<Place>();
+auto* place =
+context.template device_context<DeviceContext>().eigen_device();
Functor functor;

auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
-functor(place, x, y);
+functor(*place, x, y);
}
};

-template <typename Place, typename Functor>
+template <typename DeviceContext, typename Functor>
class ActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
@@ -59,13 +60,14 @@ class ActivationGradKernel
auto x = framework::EigenVector<T>::Flatten(*X);
auto y = framework::EigenVector<T>::Flatten(*Y);
auto dx = framework::EigenVector<T>::Flatten(*dX);
-auto place = context.GetEigenDevice<Place>();
+auto* place =
+context.template device_context<DeviceContext>().eigen_device();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
-functor(place, x, y, dy, dx);
+functor(*place, x, y, dy, dx);
}
};

4 changes: 2 additions & 2 deletions paddle/operators/adadelta_op.cc
@@ -109,5 +109,5 @@ param\_out = param + param\_update
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
REGISTER_OP_CPU_KERNEL(
-adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
-ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
+adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
+ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);
6 changes: 3 additions & 3 deletions paddle/operators/adadelta_op.cu
@@ -16,6 +16,6 @@
#include "paddle/operators/adadelta_op.h"

namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
-ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
+ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);
4 changes: 2 additions & 2 deletions paddle/operators/adadelta_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
namespace paddle {
namespace operators {

-template <typename Place, typename T>
+template <typename DeviceContext, typename T>
class AdadeltaOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
auto avg_squared_update_out =
framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
-auto place = ctx.GetEigenDevice<Place>();
+auto& place = *ctx.template device_context<DeviceContext>().eigen_device();

avg_squared_grad_out.device(place) =
rho * avg_squared_grad + (1 - rho) * grad.square();
18 changes: 9 additions & 9 deletions paddle/operators/adagrad_op.cc
@@ -100,8 +100,8 @@ size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
} // namespace

template <typename T>
-struct SparseAdagradFunctor<platform::CPUPlace, T> {
-void operator()(const platform::DeviceContext& context,
+struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
+void operator()(const platform::CPUDeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
@@ -120,7 +120,7 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());

-math::SetConstant<platform::CPUPlace, T> constant_functor;
+math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);

auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@@ -144,9 +144,9 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-gs.device(*context.GetEigenDevice<platform::CPUPlace>()) = gm * gm;
+gs.device(*context.eigen_device()) = gm * gm;

-math::SelectedRowsAddToTensor<platform::CPUPlace, T> functor;
+math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
functor(context, *grad_square, moment);

// 3. update parameter
@@ -164,13 +164,13 @@ struct SparseAdagradFunctor<platform::CPUPlace, T> {
}
};

-template struct SparseAdagradFunctor<platform::CPUPlace, float>;
-template struct SparseAdagradFunctor<platform::CPUPlace, double>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
REGISTER_OP_CPU_KERNEL(
-adagrad, ops::AdagradOpKernel<paddle::platform::CPUPlace, float>,
-ops::AdagradOpKernel<paddle::platform::CPUPlace, double>);
+adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
+ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);
20 changes: 10 additions & 10 deletions paddle/operators/adagrad_op.cu
@@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
} // namespace

template <typename T>
-struct SparseAdagradFunctor<platform::GPUPlace, T> {
-void operator()(const platform::DeviceContext& context,
+struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
+void operator()(const platform::CUDADeviceContext& context,
const framework::SelectedRows& grad,
const framework::Tensor& learning_rate, T epsilon,
framework::Tensor* moment, framework::Tensor* param) {
@@ -92,7 +92,7 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
{static_cast<int64_t>(merge_rows.size()), grad_width}),
context.GetPlace());

-math::SetConstant<platform::GPUPlace, T> constant_functor;
+math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, grad_merge->mutable_value(), 0.0);

auto* grad_merge_data = grad_merge->mutable_value()->data<T>();
@@ -119,9 +119,9 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
auto gs =
framework::EigenVector<T>::Flatten(*(grad_square->mutable_value()));
auto gm = framework::EigenVector<T>::Flatten(grad_merge->value());
-gs.device(*context.GetEigenDevice<platform::GPUPlace>()) = gm * gm;
+gs.device(*context.eigen_device()) = gm * gm;

-math::SelectedRowsAddToTensor<platform::GPUPlace, T> functor;
+math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
functor(context, *grad_square, moment);

// 3. update parameter
@@ -139,13 +139,13 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
}
};

-template struct SparseAdagradFunctor<platform::GPUPlace, float>;
-template struct SparseAdagradFunctor<platform::GPUPlace, double>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(
-adagrad, ops::AdagradOpKernel<paddle::platform::GPUPlace, float>,
-ops::AdagradOpKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_CUDA_KERNEL(
+adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
+ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);
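The math functors follow the same convention. A sketch of the assumed shape, modeled on the `SetConstant` calls in the hunks above (the real definition lives under paddle/operators/math and may differ in detail):

// Functors take the concrete DeviceContext as a template parameter and
// receive it by const reference, replacing the old Place-based dispatch.
template <typename DeviceContext, typename T>
struct SetConstant {
  void operator()(const DeviceContext& context, framework::Tensor* tensor,
                  T value) {
    auto t = framework::EigenVector<T>::Flatten(*tensor);
    // Each concrete context exposes its own eigen_device().
    t.device(*context.eigen_device()) = t.constant(value);
  }
};

// Call sites pick the specialization through the context they already hold:
//   math::SetConstant<platform::CUDADeviceContext, T> set_zero;
//   set_zero(dev_ctx, output, static_cast<T>(0));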