Skip to content

Commit

Permalink
move fusion_group kernel to phi (#53781)
Browse files Browse the repository at this point in the history
  • Loading branch information
huangjiyi authored May 18, 2023
1 parent 0bed220 commit 26da689
Show file tree
Hide file tree
Showing 15 changed files with 216 additions and 205 deletions.
4 changes: 2 additions & 2 deletions paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ if(WITH_GPU OR WITH_ROCM)
cc_test(
test_code_generator
SRCS code_generator_tester.cc
DEPS code_generator device_code lod_tensor graph_viz_pass)
DEPS code_generator phi_backends lod_tensor graph_viz_pass)
endif()

cc_library(
fusion_group_pass
SRCS fusion_group_pass.cc elementwise_group_detector.cc
DEPS subgraph_detector fuse_pass_base code_generator device_code)
DEPS subgraph_detector fuse_pass_base code_generator phi_backends)
cc_test(
test_fusion_group_pass
SRCS fusion_group_pass_tester.cc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/fusion_group/code_generator.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/device_code.h"

namespace phi {
class DenseTensor;
Expand Down Expand Up @@ -182,7 +182,7 @@ void TestMainImpl(std::string func_name,
std::type_index(typeid(paddle::platform::float16));

paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode device_code(place, func_name, code_str);
phi::GPUDeviceCode device_code(place, func_name, code_str);
#ifdef PADDLE_WITH_HIP
device_code.Compile(true);
#else
Expand Down
18 changes: 8 additions & 10 deletions paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/platform/device_code.h"
namespace paddle {
namespace platform {
#include "paddle/phi/backends/device_code.h"
namespace phi {
class DeviceCodePool;
} // namespace platform
} // namespace paddle
} // namespace phi

namespace paddle {
namespace framework {
Expand All @@ -36,7 +34,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init("fusion_group_pass", graph);
if (Get<bool>("use_gpu")) {
// TODO(liuyiqun): open this check.
// if (!platform::CUDADeviceCode::IsAvailable()) {
// if (!phi::GPUDeviceCode::IsAvailable()) {
// LOG(WARNING)
// << "Disable fusion_group because CUDA Driver or NVRTC is not
// available.";
Expand All @@ -54,7 +52,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
// TODO(liuyiqun): supported different places
platform::CUDAPlace place = platform::CUDAPlace(0);
int index = platform::DeviceCodePool::Init({place}).size(place);
int index = phi::DeviceCodePool::Init({place}).size(place);

std::vector<std::vector<Node*>> subgraphs =
fusion_group::ElementwiseGroupDetector()(graph);
Expand Down Expand Up @@ -88,11 +86,11 @@ bool FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const {

// TODO(liuyiqun): supported different places
platform::CUDAPlace place = platform::CUDAPlace(0);
std::unique_ptr<platform::CUDADeviceCode> device_code(
new platform::CUDADeviceCode(place, subgraph->GetFuncName(), code_str));
std::unique_ptr<phi::GPUDeviceCode> device_code(
new phi::GPUDeviceCode(place, subgraph->GetFuncName(), code_str));
bool is_compiled = device_code->Compile();
if (is_compiled) {
platform::DeviceCodePool& pool = platform::DeviceCodePool::Init({place});
phi::DeviceCodePool& pool = phi::DeviceCodePool::Init({place});
pool.Set(std::move(device_code));
}
return is_compiled;
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/fused/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ if(WITH_GPU OR WITH_ROCM)
op_library(fused_gate_attention_op)
# fusion_group
if(NOT APPLE AND NOT WIN32)
op_library(fusion_group_op DEPS device_code)
op_library(fusion_group_op)
endif()
# fused_bn_add_activation
# HIP not support bn act fuse in MIOPEN
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/fused/fusion_group_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fused/fusion_group_op.h"
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {
Expand Down
99 changes: 0 additions & 99 deletions paddle/fluid/operators/fused/fusion_group_op.h

This file was deleted.

6 changes: 1 addition & 5 deletions paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -356,15 +356,11 @@ if(WITH_ROCM)
endif()

if(NOT APPLE AND NOT WIN32)
cc_library(
device_code
SRCS device_code.cc
DEPS device_context)
if(WITH_GPU OR WITH_ROCM)
cc_test(
device_code_test
SRCS device_code_test.cc
DEPS device_code lod_tensor)
DEPS phi_backends lod_tensor)
endif()
endif()

Expand Down
44 changes: 24 additions & 20 deletions paddle/fluid/platform/device_code_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/device_code.h"
#include "paddle/phi/backends/device_code.h"

#include <utility>

Expand Down Expand Up @@ -47,14 +47,13 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(DeviceCode, cuda) {
if (!paddle::platform::dynload::HasNVRTC() ||
!paddle::platform::dynload::HasCUDADriver()) {
if (!phi::dynload::HasNVRTC() || !phi::dynload::HasCUDADriver()) {
return;
}

paddle::framework::InitDevices({0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code);
phi::GPUPlace place = phi::GPUPlace(0);
phi::GPUDeviceCode code(place, "saxpy_kernel", saxpy_code);

phi::DenseTensor cpu_x;
phi::DenseTensor cpu_y;
Expand All @@ -63,8 +62,12 @@ TEST(DeviceCode, cuda) {
float scale = 2;
auto dims =
phi::make_ddim({static_cast<int64_t>(256), static_cast<int64_t>(1024)});
cpu_x.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_y.mutable_data<float>(dims, paddle::platform::CPUPlace());
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* cpu_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
cpu_x.Resize(dims);
cpu_ctx->template Alloc<float>(&cpu_x);
cpu_y.Resize(dims);
cpu_ctx->template Alloc<float>(&cpu_y);

size_t n = cpu_x.numel();
for (size_t i = 0; i < n; ++i) {
Expand All @@ -78,9 +81,13 @@ TEST(DeviceCode, cuda) {
phi::DenseTensor y;
phi::DenseTensor z;

float* x_data = x.mutable_data<float>(dims, place);
float* y_data = y.mutable_data<float>(dims, place);
float* z_data = z.mutable_data<float>(dims, place);
auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(place));
x.Resize(dims);
float* x_data = dev_ctx->template Alloc<float>(&x);
y.Resize(dims);
float* y_data = dev_ctx->template Alloc<float>(&y);
z.Resize(dims);
float* z_data = dev_ctx->template Alloc<float>(&z);

paddle::framework::TensorCopySync(cpu_x, place, &x);
paddle::framework::TensorCopySync(cpu_y, place, &y);
Expand All @@ -92,36 +99,33 @@ TEST(DeviceCode, cuda) {
code.SetWorkloadPerThread(1);
code.Launch(n, &args);

auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place);
dev_ctx->Wait();

paddle::framework::TensorCopySync(z, paddle::platform::CPUPlace(), &cpu_z);
paddle::framework::TensorCopySync(z, phi::CPUPlace(), &cpu_z);
for (size_t i = 0; i < n; i++) {
EXPECT_EQ(cpu_z.data<float>()[i], static_cast<float>(i) * scale + 0.5);
}
}

TEST(DeviceCodePool, cuda) {
if (!paddle::platform::dynload::HasNVRTC() ||
!paddle::platform::dynload::HasCUDADriver()) {
if (!phi::dynload::HasNVRTC() || !phi::dynload::HasCUDADriver()) {
return;
}

paddle::framework::InitDevices({0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::DeviceCodePool& pool =
paddle::platform::DeviceCodePool::Init({place});
phi::GPUPlace place = phi::GPUPlace(0);
phi::DeviceCodePool& pool = phi::DeviceCodePool::Init({place});
size_t num_device_codes_before = pool.size(place);
EXPECT_EQ(num_device_codes_before, 0UL);

std::unique_ptr<paddle::platform::DeviceCode> code(
new paddle::platform::CUDADeviceCode(place, "saxpy_kernel", saxpy_code));
std::unique_ptr<phi::DeviceCode> code(
new phi::GPUDeviceCode(place, "saxpy_kernel", saxpy_code));
LOG(INFO) << "origin ptr: " << code.get();
pool.Set(std::move(code));
size_t num_device_codes_after = pool.size(place);
EXPECT_EQ(num_device_codes_after, 1UL);

paddle::platform::DeviceCode* code_get = pool.Get(place, "saxpy_kernel");
phi::DeviceCode* code_get = pool.Get(place, "saxpy_kernel");
LOG(INFO) << "get ptr: " << code_get;
}
#endif
4 changes: 4 additions & 0 deletions paddle/phi/backends/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()

if(NOT APPLE AND NOT WIN32)
list(APPEND BACKENDS_SRCS device_code.cc)
endif()

if(WITH_GPU OR WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
gpu/gpu_resources.cc)
Expand Down
Loading

0 comments on commit 26da689

Please sign in to comment.