Skip to content

Commit

Permalink
move fusion_group kernel to phi (#53781)
Browse files Browse the repository at this point in the history
  • Loading branch information
huangjiyi authored May 18, 2023
1 parent 0bed220 commit 26da689
Show file tree
Hide file tree
Showing 15 changed files with 216 additions and 205 deletions.
4 changes: 2 additions & 2 deletions paddle/fluid/framework/ir/fusion_group/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ if(WITH_GPU OR WITH_ROCM)
cc_test(
test_code_generator
SRCS code_generator_tester.cc
DEPS code_generator device_code lod_tensor graph_viz_pass)
DEPS code_generator phi_backends lod_tensor graph_viz_pass)
endif()

cc_library(
fusion_group_pass
SRCS fusion_group_pass.cc elementwise_group_detector.cc
DEPS subgraph_detector fuse_pass_base code_generator device_code)
DEPS subgraph_detector fuse_pass_base code_generator phi_backends)
cc_test(
test_fusion_group_pass
SRCS fusion_group_pass_tester.cc
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/fusion_group/code_generator.h"
#include "paddle/fluid/framework/ir/fusion_group/operation.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/platform/device_code.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/device_code.h"

namespace phi {
class DenseTensor;
Expand Down Expand Up @@ -182,7 +182,7 @@ void TestMainImpl(std::string func_name,
std::type_index(typeid(paddle::platform::float16));

paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode device_code(place, func_name, code_str);
phi::GPUDeviceCode device_code(place, func_name, code_str);
#ifdef PADDLE_WITH_HIP
device_code.Compile(true);
#else
Expand Down
18 changes: 8 additions & 10 deletions paddle/fluid/framework/ir/fusion_group/fusion_group_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass_tester_helper.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/platform/device_code.h"
namespace paddle {
namespace platform {
#include "paddle/phi/backends/device_code.h"
namespace phi {
class DeviceCodePool;
} // namespace platform
} // namespace paddle
} // namespace phi

namespace paddle {
namespace framework {
Expand All @@ -36,7 +34,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
FusePassBase::Init("fusion_group_pass", graph);
if (Get<bool>("use_gpu")) {
// TODO(liuyiqun): open this check.
// if (!platform::CUDADeviceCode::IsAvailable()) {
// if (!phi::GPUDeviceCode::IsAvailable()) {
// LOG(WARNING)
// << "Disable fusion_group because CUDA Driver or NVRTC is not
// available.";
Expand All @@ -54,7 +52,7 @@ void FusionGroupPass::ApplyImpl(ir::Graph* graph) const {
int FusionGroupPass::DetectFusionGroup(Graph* graph, int type) const {
// TODO(liuyiqun): supported different places
platform::CUDAPlace place = platform::CUDAPlace(0);
int index = platform::DeviceCodePool::Init({place}).size(place);
int index = phi::DeviceCodePool::Init({place}).size(place);

std::vector<std::vector<Node*>> subgraphs =
fusion_group::ElementwiseGroupDetector()(graph);
Expand Down Expand Up @@ -88,11 +86,11 @@ bool FusionGroupPass::GenerateCode(fusion_group::SubGraph* subgraph) const {

// TODO(liuyiqun): supported different places
platform::CUDAPlace place = platform::CUDAPlace(0);
std::unique_ptr<platform::CUDADeviceCode> device_code(
new platform::CUDADeviceCode(place, subgraph->GetFuncName(), code_str));
std::unique_ptr<phi::GPUDeviceCode> device_code(
new phi::GPUDeviceCode(place, subgraph->GetFuncName(), code_str));
bool is_compiled = device_code->Compile();
if (is_compiled) {
platform::DeviceCodePool& pool = platform::DeviceCodePool::Init({place});
phi::DeviceCodePool& pool = phi::DeviceCodePool::Init({place});
pool.Set(std::move(device_code));
}
return is_compiled;
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/fused/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ if(WITH_GPU OR WITH_ROCM)
op_library(fused_gate_attention_op)
# fusion_group
if(NOT APPLE AND NOT WIN32)
op_library(fusion_group_op DEPS device_code)
op_library(fusion_group_op)
endif()
# fused_bn_add_activation
# HIP not support bn act fuse in MIOPEN
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/operators/fused/fusion_group_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/fused/fusion_group_op.h"
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {
Expand Down
99 changes: 0 additions & 99 deletions paddle/fluid/operators/fused/fusion_group_op.h

This file was deleted.

6 changes: 1 addition & 5 deletions paddle/fluid/platform/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -356,15 +356,11 @@ if(WITH_ROCM)
endif()

if(NOT APPLE AND NOT WIN32)
cc_library(
device_code
SRCS device_code.cc
DEPS device_context)
if(WITH_GPU OR WITH_ROCM)
cc_test(
device_code_test
SRCS device_code_test.cc
DEPS device_code lod_tensor)
DEPS phi_backends lod_tensor)
endif()
endif()

Expand Down
44 changes: 24 additions & 20 deletions paddle/fluid/platform/device_code_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/platform/device_code.h"
#include "paddle/phi/backends/device_code.h"

#include <utility>

Expand Down Expand Up @@ -47,14 +47,13 @@ void saxpy_kernel(float a, float *x, float* y, float* z, size_t n) {

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
TEST(DeviceCode, cuda) {
if (!paddle::platform::dynload::HasNVRTC() ||
!paddle::platform::dynload::HasCUDADriver()) {
if (!phi::dynload::HasNVRTC() || !phi::dynload::HasCUDADriver()) {
return;
}

paddle::framework::InitDevices({0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceCode code(place, "saxpy_kernel", saxpy_code);
phi::GPUPlace place = phi::GPUPlace(0);
phi::GPUDeviceCode code(place, "saxpy_kernel", saxpy_code);

phi::DenseTensor cpu_x;
phi::DenseTensor cpu_y;
Expand All @@ -63,8 +62,12 @@ TEST(DeviceCode, cuda) {
float scale = 2;
auto dims =
phi::make_ddim({static_cast<int64_t>(256), static_cast<int64_t>(1024)});
cpu_x.mutable_data<float>(dims, paddle::platform::CPUPlace());
cpu_y.mutable_data<float>(dims, paddle::platform::CPUPlace());
phi::DeviceContextPool& pool = phi::DeviceContextPool::Instance();
auto* cpu_ctx = reinterpret_cast<phi::CPUContext*>(pool.Get(phi::CPUPlace()));
cpu_x.Resize(dims);
cpu_ctx->template Alloc<float>(&cpu_x);
cpu_y.Resize(dims);
cpu_ctx->template Alloc<float>(&cpu_y);

size_t n = cpu_x.numel();
for (size_t i = 0; i < n; ++i) {
Expand All @@ -78,9 +81,13 @@ TEST(DeviceCode, cuda) {
phi::DenseTensor y;
phi::DenseTensor z;

float* x_data = x.mutable_data<float>(dims, place);
float* y_data = y.mutable_data<float>(dims, place);
float* z_data = z.mutable_data<float>(dims, place);
auto* dev_ctx = reinterpret_cast<phi::GPUContext*>(pool.Get(place));
x.Resize(dims);
float* x_data = dev_ctx->template Alloc<float>(&x);
y.Resize(dims);
float* y_data = dev_ctx->template Alloc<float>(&y);
z.Resize(dims);
float* z_data = dev_ctx->template Alloc<float>(&z);

paddle::framework::TensorCopySync(cpu_x, place, &x);
paddle::framework::TensorCopySync(cpu_y, place, &y);
Expand All @@ -92,36 +99,33 @@ TEST(DeviceCode, cuda) {
code.SetWorkloadPerThread(1);
code.Launch(n, &args);

auto* dev_ctx = paddle::platform::DeviceContextPool::Instance().Get(place);
dev_ctx->Wait();

paddle::framework::TensorCopySync(z, paddle::platform::CPUPlace(), &cpu_z);
paddle::framework::TensorCopySync(z, phi::CPUPlace(), &cpu_z);
for (size_t i = 0; i < n; i++) {
EXPECT_EQ(cpu_z.data<float>()[i], static_cast<float>(i) * scale + 0.5);
}
}

TEST(DeviceCodePool, cuda) {
if (!paddle::platform::dynload::HasNVRTC() ||
!paddle::platform::dynload::HasCUDADriver()) {
if (!phi::dynload::HasNVRTC() || !phi::dynload::HasCUDADriver()) {
return;
}

paddle::framework::InitDevices({0});
paddle::platform::CUDAPlace place = paddle::platform::CUDAPlace(0);
paddle::platform::DeviceCodePool& pool =
paddle::platform::DeviceCodePool::Init({place});
phi::GPUPlace place = phi::GPUPlace(0);
phi::DeviceCodePool& pool = phi::DeviceCodePool::Init({place});
size_t num_device_codes_before = pool.size(place);
EXPECT_EQ(num_device_codes_before, 0UL);

std::unique_ptr<paddle::platform::DeviceCode> code(
new paddle::platform::CUDADeviceCode(place, "saxpy_kernel", saxpy_code));
std::unique_ptr<phi::DeviceCode> code(
new phi::GPUDeviceCode(place, "saxpy_kernel", saxpy_code));
LOG(INFO) << "origin ptr: " << code.get();
pool.Set(std::move(code));
size_t num_device_codes_after = pool.size(place);
EXPECT_EQ(num_device_codes_after, 1UL);

paddle::platform::DeviceCode* code_get = pool.Get(place, "saxpy_kernel");
phi::DeviceCode* code_get = pool.Get(place, "saxpy_kernel");
LOG(INFO) << "get ptr: " << code_get;
}
#endif
4 changes: 4 additions & 0 deletions paddle/phi/backends/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()

if(NOT APPLE AND NOT WIN32)
list(APPEND BACKENDS_SRCS device_code.cc)
endif()

if(WITH_GPU OR WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
gpu/gpu_resources.cc)
Expand Down
Loading

0 comments on commit 26da689

Please sign in to comment.