Commit 25b621a

Merge branch 'develop' into xpu_pass

leolishaohao committed Sep 4, 2023
2 parents: 2c0f5cf + c089a2a
Showing 295 changed files with 7,134 additions and 3,370 deletions.
6 changes: 3 additions & 3 deletions .clang-tidy
@@ -12,7 +12,7 @@ bugprone-exception-escape,
 -bugprone-fold-init-type,
 -bugprone-forwarding-reference-overload,
 -bugprone-inaccurate-erase,
--bugprone-incorrect-roundings,
+bugprone-incorrect-roundings,
 -bugprone-infinite-loop,
 bugprone-integer-division,
 -bugprone-macro-repeated-side-effects,
@@ -186,7 +186,7 @@ modernize-use-equals-default,
 -modernize-use-noexcept,
 modernize-use-nullptr,
 modernize-use-override,
--modernize-use-transparent-functors,
+modernize-use-transparent-functors,
 -modernize-use-uncaught-exceptions,
 performance-faster-string-find,
 -performance-for-range-copy,
@@ -197,7 +197,7 @@ performance-inefficient-string-concatenation,
 -performance-move-const-arg,
 -performance-move-constructor-init,
 -performance-no-automatic-move,
--performance-noexcept-move-constructor,
+performance-noexcept-move-constructor,
 -performance-trivially-destructible,
 -performance-type-promotion-in-math-fn,
 -performance-unnecessary-copy-initialization,
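In a .clang-tidy Checks list, a leading '-' disables a check, so this hunk turns on three checks that were previously off: bugprone-incorrect-roundings, modernize-use-transparent-functors, and performance-noexcept-move-constructor. A small illustrative file (not from the Paddle tree) showing the kind of code each newly enabled check flags:

    // clang-tidy examples — illustrative only.
    #include <cmath>
    #include <functional>
    #include <set>
    #include <string>
    #include <utility>
    #include <vector>

    // bugprone-incorrect-roundings: (int)(x + 0.5) truncates, so negative
    // values round the wrong way; std::round is the suggested fix.
    int bad_round(double x) { return static_cast<int>(x + 0.5); }   // flagged
    int good_round(double x) { return static_cast<int>(std::round(x)); }

    // modernize-use-transparent-functors: prefer the transparent std::less<>
    // (which also enables heterogeneous lookup) over std::less<std::string>.
    std::set<std::string, std::less<std::string>> flagged_set;       // flagged
    std::set<std::string, std::less<>> preferred_set;

    // performance-noexcept-move-constructor: a move constructor that is not
    // noexcept makes std::vector copy instead of move when it reallocates.
    struct Buffer {
      std::vector<int> data;
      Buffer() = default;
      Buffer(Buffer&& other) noexcept : data(std::move(other.data)) {}
    };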
2 changes: 1 addition & 1 deletion .gitignore
@@ -72,7 +72,7 @@ tools/nvcc_lazy
 
 # This file is automatically generated.
 # TODO(zhiqiang) Move this file to build directory.
-paddle/fluid/pybind/eager_op_function.cc
+paddle/fluid/pybind/eager_op_function.*
 tools/nvcc_lazy
 paddle/phi/kernels/sparse/gpu/cutlass_generator/all_gemm_operations.h
 paddle/phi/kernels/sparse/gpu/cutlass_generator/configurations.h
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -605,6 +605,7 @@ if(WITH_CINN)
   add_definitions(-DPADDLE_WITH_CINN)
 
   if(CINN_ONLY)
+    add_definitions(-DCINN_WITH_ONLY)
     if(WITH_PYTHON)
       add_subdirectory(python)
     endif()
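CINN_ONLY builds now also define CINN_WITH_ONLY for the C++ code. The code that tests the macro is elsewhere in the tree and not shown in this diff; a hypothetical guard of the usual form:

    // Hypothetical use of the new definition — the real guarded code is not
    // part of this diff.
    #ifdef CINN_WITH_ONLY
    constexpr bool kCinnStandaloneBuild = true;   // built with CINN_ONLY=ON
    #else
    constexpr bool kCinnStandaloneBuild = false;  // CINN integrated with Paddle
    #endif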
8 changes: 4 additions & 4 deletions cmake/cinn.cmake
@@ -168,8 +168,8 @@ cinn_cc_library(
 add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
 add_dependencies(cinnapi GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
 if(NOT CINN_ONLY)
-  target_link_libraries(cinnapi phi)
-  add_dependencies(cinnapi phi)
+  target_link_libraries(cinnapi pd_dialect phi)
+  add_dependencies(cinnapi pd_dialect phi)
 endif()
 
 target_link_libraries(cinnapi ${PYTHON_LIBRARIES})
@@ -226,8 +226,8 @@ function(gen_cinncore LINKTYPE)
   add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ZLIB::ZLIB)
   add_dependencies(${CINNCORE_TARGET} GEN_LLVM_RUNTIME_IR_HEADER ${core_deps})
   if(NOT CINN_ONLY)
-    target_link_libraries(${CINNCORE_TARGET} phi)
-    add_dependencies(${CINNCORE_TARGET} phi)
+    target_link_libraries(${CINNCORE_TARGET} pd_dialect phi)
+    add_dependencies(${CINNCORE_TARGET} pd_dialect phi)
   endif()
 
   add_dependencies(${CINNCORE_TARGET} pybind)
14 changes: 5 additions & 9 deletions cmake/external/openblas.cmake
@@ -20,16 +20,12 @@ set(CBLAS_SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/openblas)
 set(CBLAS_TAG v0.3.7)
 set(CMAKE_VERBOSE_MAKEFILE 1)
 
-# Why use v0.3.18? The IDG business line encountered a random openblas error,
-# which can be resolved after upgrading openblas.
-# And why compile when gcc>8.2? Please refer to
-# https://github.com/spack/spack/issues/19932#issuecomment-733452619
-# v0.3.18 only support gcc>=8.3 or gcc>=7.4
-if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-   AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 8.2
+# OpenBLAS support Raptor Lake from v0.3.22
+if(UNIX
+   AND NOT APPLE
+   AND NOT WITH_ROCM
    AND NOT WITH_XPU)
-  # We only compile with openblas 0.3.18 when gcc >= 8.3
-  set(CBLAS_TAG v0.3.18)
+  set(CBLAS_TAG v0.3.23)
 endif()
 
 if(APPLE AND WITH_ARM)
5 changes: 5 additions & 0 deletions cmake/flags.cmake
@@ -141,6 +141,11 @@ if(NOT WIN32)
   set(COMMON_FLAGS
       -fPIC
       -fno-omit-frame-pointer
+      -pipe
+      -ffunction-sections
+      -fdata-sections
+      -Wl
+      -gc-sections
       -Werror
       -Wall
       -Wextra
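-pipe only speeds up compilation (temporaries go through pipes instead of files); -ffunction-sections and -fdata-sections place every function and data object in its own section so that section-level garbage collection at link time can drop whatever is unreferenced. Note the linker request appears here as the two separate list items -Wl and -gc-sections; when invoking the compiler driver directly it is conventionally spelled as the single argument -Wl,--gc-sections. A minimal illustration (not from the tree) of what the combination buys:

    // gc_sections_demo.cc — illustrative only. Built as:
    //   g++ -ffunction-sections -fdata-sections -Wl,--gc-sections gc_sections_demo.cc
    // never_called() sits in its own unreferenced section, so the linker
    // discards it and the binary shrinks.
    int used() { return 42; }
    int never_called() { return 7; }  // garbage-collected at link time
    int main() { return used(); }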
4 changes: 2 additions & 2 deletions paddle/cinn/auto_schedule/auto_tuner.cc
@@ -63,8 +63,8 @@ void AutoTuner::Initialize(const Config& config,
   const auto& shape_dict = graph_->GetAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
 
-  op_lowerer_ = std::make_unique<hlir::framework::OpLowerer>(
-      dtype_dict, shape_dict, target_);
+  op_lowerer_ = std::make_unique<hlir::framework::OpLowerer<GroupPtr>>(
+      new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target_));
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   for (auto i = 0; i < tasks_.size(); ++i) {
     auto&& task = tasks_[i];
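Most of the C++ changes in this commit follow from one refactor: OpLowerer is now a class template over the group type, constructed from a heap-allocated OpLowererImpl. A compilable sketch of the shape this implies — only the names OpLowerer<GroupPtr>, OpLowererImpl, and CreateOpLowerer come from the diff; the base class, signatures, and stubs below are assumptions:

    #include <memory>
    #include <vector>

    struct LoweredFunc {};  // stub standing in for ir::LoweredFunc
    struct Group {};        // stub standing in for hlir::framework::Graph::Group
    using GroupPtr = std::shared_ptr<Group>;

    template <typename T>
    class OpLowererImplBase {
     public:
      virtual ~OpLowererImplBase() = default;
      virtual std::vector<LoweredFunc> Lower(const T& group,
                                             bool apply_op_schedule,
                                             bool apply_group_schedule) = 0;
    };

    // The wrapper owns a concrete lowering strategy and forwards to it, which
    // is why call sites now pass `new OpLowererImpl(...)` to the constructor.
    template <typename T>
    class OpLowerer {
     public:
      explicit OpLowerer(OpLowererImplBase<T>* impl) : impl_(impl) {}
      std::vector<LoweredFunc> Lower(const T& group,
                                     bool apply_op_schedule = true,
                                     bool apply_group_schedule = true) {
        return impl_->Lower(group, apply_op_schedule, apply_group_schedule);
      }

     private:
      std::shared_ptr<OpLowererImplBase<T>> impl_;
    };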
4 changes: 2 additions & 2 deletions paddle/cinn/auto_schedule/auto_tuner.h
@@ -30,11 +30,11 @@
 
 namespace cinn {
 namespace auto_schedule {
-
 // This class is entrance of auto-tune, users can use it
 // to tune graph (not supported yet) and search a series of schedules
 // that maybe more likely to obtain better performance.
 // Internally, it creates necessary components and use them to perform tuning.
+using GroupPtr = hlir::framework::GroupPtr;
 class AutoTuner {
  public:
   // configure how to perform auto-tune, such as
@@ -58,7 +58,7 @@ class AutoTuner {
  private:
   const common::Target& target_;
   hlir::framework::Graph* graph_;
-  std::unique_ptr<hlir::framework::OpLowerer> op_lowerer_;
+  std::unique_ptr<hlir::framework::OpLowerer<GroupPtr>> op_lowerer_;
 
   // Tasks to tune
   std::vector<TuneTask> tasks_;
7 changes: 4 additions & 3 deletions paddle/cinn/auto_schedule/measure/measurer_test.cc
@@ -26,6 +26,7 @@
 #include "paddle/cinn/frontend/syntax.h"
 #include "paddle/cinn/hlir/framework/graph_compiler.h"
 #include "paddle/cinn/hlir/framework/graph_compiler_util.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
 #include "paddle/cinn/runtime/flags.h"
 
 namespace cinn {
@@ -75,12 +76,12 @@ class TestMeasurer : public ::testing::Test {
         absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
             "infershape");
 
-    auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
-        dtype_dict, shape_dict, target);
+    auto op_lowerer =
+        hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
     inputs.reserve(tasks.size());
     for (int i = 0; i < tasks.size(); ++i) {
       auto* task = &tasks[i];
-      task->Initialize(shape_dict, dtype_dict, op_lowerer.get());
+      task->Initialize(shape_dict, dtype_dict, &op_lowerer);
       MeasureInput input;
       input.task = task;
       input.lowered_funcs = task->lowered_funcs;
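The test updates all trade std::make_unique<OpLowerer>(...) plus op_lowerer.get() for a CreateOpLowerer factory used by value (op_lowerer.Lower, &op_lowerer), which suggests the factory returns the wrapper directly. A hypothetical definition consistent with that usage, reusing the stub types from the sketch above (the real factory takes the dtype/shape dictionaries and a Target):

    class OpLowererImpl : public OpLowererImplBase<GroupPtr> {
     public:
      std::vector<LoweredFunc> Lower(const GroupPtr& group,
                                     bool apply_op_schedule,
                                     bool apply_group_schedule) override {
        return {};  // real lowering elided
      }
    };

    OpLowerer<GroupPtr> CreateOpLowerer() {  // parameters elided in this sketch
      return OpLowerer<GroupPtr>(new OpLowererImpl());
    }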
@@ -161,14 +161,14 @@ TEST(AutoInline, AddReluInline) {
           "inferdtype");
   const auto& shape_dict = graph->GetAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
-  auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
-      dtype_dict, shape_dict, target);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
 
   EXPECT_EQ(graph->fusion_groups.size(), 1UL);
   std::vector<ir::LoweredFunc> funcs =
-      op_lowerer->Lower(graph->fusion_groups[0],
-                        /*apply_op_schedule = */ false,
-                        /*apply_group_schedule=*/false);
+      op_lowerer.Lower(graph->fusion_groups[0],
+                       /*apply_op_schedule = */ false,
+                       /*apply_group_schedule=*/false);
 
   VLOG(6) << "Expr before auto inline: " << funcs[0]->body;
 
@@ -61,7 +61,8 @@ ir::IRSchedule TestAutoGenRuleBase::MakeIRSchedule(
           "inferdtype");
   auto& shape_dict = graph->GetMutableAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
-  hlir::framework::OpLowerer op_lowerer(dtype_dict, shape_dict, target_);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target_);
 
   lowered_funcs_ =
       op_lowerer.Lower(graph->fusion_groups.front(),
@@ -27,6 +27,7 @@
 #include "paddle/cinn/auto_schedule/task/task_registry.h"
 #include "paddle/cinn/auto_schedule/task/tune_task.h"
 #include "paddle/cinn/auto_schedule/tuning.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/ir/schedule/ir_schedule.h"
 #include "test/cpp/cinn/program_builder.h"
@@ -44,11 +45,11 @@ std::vector<TuneTask> CreateTasks(const frontend::Program& program,
           "inferdtype");
   const auto& shape_dict = graph->GetAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
-  auto op_lowerer = std::make_unique<hlir::framework::OpLowerer>(
-      dtype_dict, shape_dict, target);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
   InitialTaskRegistry* task_registry = InitialTaskRegistry::Global();
   for (auto i = 0; i < tasks.size(); ++i) {
-    tasks[i].Initialize(shape_dict, dtype_dict, op_lowerer.get());
+    tasks[i].Initialize(shape_dict, dtype_dict, &op_lowerer);
     task_registry->Regist(tasks[i].serialized_key,
                           ir::ModuleExpr(tasks[i].GetLoweredFuncBodyExprs()));
   }
7 changes: 3 additions & 4 deletions paddle/cinn/auto_schedule/task/task_registry_test.cc
@@ -45,11 +45,10 @@ std::vector<TuneTask> CreateTasks(hlir::framework::Graph* graph,
   const auto& shape_dict = graph->GetAttrs<
       absl::flat_hash_map<std::string, hlir::framework::shape_t>>("infershape");
 
-  std::unique_ptr<hlir::framework::OpLowerer> op_lowerer =
-      std::make_unique<hlir::framework::OpLowerer>(
-          dtype_dict, shape_dict, target);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
   for (TuneTask& task : tasks) {
-    task.Initialize(shape_dict, dtype_dict, op_lowerer.get());
+    task.Initialize(shape_dict, dtype_dict, &op_lowerer);
     VLOG(3) << "Add a task with serialized_key:\n" << task.serialized_key;
   }
 
2 changes: 1 addition & 1 deletion paddle/cinn/auto_schedule/task/tune_task.cc
@@ -34,7 +34,7 @@ void TuneTask::Initialize(
     const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
         shape_dict,
     const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
-    hlir::framework::OpLowerer* lower_handler) {
+    hlir::framework::OpLowerer<GroupPtr>* lower_handler) {
   CHECK(lower_handler != nullptr) << "op_lowerer can't be nullptr";
   op_lowerer = lower_handler;
 
9 changes: 5 additions & 4 deletions paddle/cinn/auto_schedule/task/tune_task.h
@@ -34,24 +34,25 @@ namespace cinn {
 namespace auto_schedule {
 
 class TuneTask {
+  using GroupPtr = hlir::framework::GroupPtr;
+
  public:
   TuneTask() = default;
-  explicit TuneTask(std::shared_ptr<hlir::framework::Graph::Group> group)
-      : subgraph(group) {}
+  explicit TuneTask(GroupPtr group) : subgraph(group) {}
   // Initialize a task
   void Initialize(
       const absl::flat_hash_map<std::string, hlir::framework::shape_t>&
           shape_dict,
       const absl::flat_hash_map<std::string, cinn::common::Type>& dtype_dict,
-      hlir::framework::OpLowerer* lower_handler);
+      hlir::framework::OpLowerer<GroupPtr>* lower_handler);
   // Extract bodies in lowered_funcs() and return
   std::vector<ir::Expr> GetLoweredFuncBodyExprs() const;
 
   // In CINN, we use hlir::framework::Graph::Group to represent a fused
   // sub-graph (if an op won't be fused, it will be a Group with size=1).
   std::shared_ptr<hlir::framework::Graph::Group> subgraph;
   // Lower handler, Not owned
-  hlir::framework::OpLowerer* op_lowerer;
+  hlir::framework::OpLowerer<GroupPtr>* op_lowerer;
   // target of this task
   common::Target target;
   // stores the initial (un-optimized) LoweredFuncs
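TuneTask now spells its group type through the hlir::framework::GroupPtr alias while the subgraph member keeps the explicit std::shared_ptr type, so the two presumably coincide. An assumed definition, not shown in this diff:

    // Assumption: the alias used throughout this commit.
    using GroupPtr = std::shared_ptr<hlir::framework::Graph::Group>;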
9 changes: 6 additions & 3 deletions paddle/cinn/auto_schedule/task/tune_task_test.cc
@@ -75,7 +75,8 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_NoPass) {
   const auto& dtype_dict =
       graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
           "inferdtype");
-  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+  auto op_lowerer =
+      hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target);
 
   std::stringstream ss;
   for (TuneTask& task : tasks) {
@@ -187,7 +188,8 @@ TEST(TuneTask, GraphToUnoptLoweredFunc_ApplyPass) {
       graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
           "inferdtype");
 
-  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+  OpLowerer op_lowerer(
+      new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target));
 
   std::stringstream ss;
   for (TuneTask& task : tasks) {
@@ -291,7 +293,8 @@ TEST(TuneTask, SerializeToString) {
   const auto& dtype_dict =
       graph->GetAttrs<absl::flat_hash_map<std::string, common::Type>>(
          "inferdtype");
-  OpLowerer op_lowerer(dtype_dict, shape_dict, target);
+  OpLowerer op_lowerer(
+      new hlir::framework::OpLowererImpl(dtype_dict, shape_dict, target));
   ASSERT_EQ(single_tasks.size(), 2UL);
   for (auto&& task : single_tasks) {
     task.Initialize(shape_dict, dtype_dict, &op_lowerer);
12 changes: 6 additions & 6 deletions paddle/cinn/auto_schedule/tests/performance_comparison_test.cc
@@ -27,6 +27,7 @@
 #include "paddle/cinn/hlir/framework/graph_compiler.h"
 #include "paddle/cinn/hlir/framework/graph_compiler_util.h"
 #include "paddle/cinn/hlir/framework/node.h"
+#include "paddle/cinn/hlir/framework/op_lowering.h"
 #include "paddle/cinn/hlir/framework/pass.h"
 #include "paddle/cinn/ir/ir_base.h"
 #include "paddle/cinn/runtime/flags.h"
@@ -143,9 +144,8 @@ class PerformanceTester : public ::testing::Test {
         absl::flat_hash_map<std::string, hlir::framework::shape_t>>(
             "infershape");
 
-    std::shared_ptr<hlir::framework::OpLowerer> op_lowerer =
-        std::make_unique<hlir::framework::OpLowerer>(
-            dtype_dict, shape_dict, target_);
+    auto op_lowerer =
+        hlir::framework::CreateOpLowerer(dtype_dict, shape_dict, target_);
 
     CompilationContext& context = graph_compiler->GetCompilationContext();
     context.with_instantiate_variables = true;
@@ -157,9 +157,9 @@
 
     for (auto group : graph->fusion_groups) {
      context.lowered_funcs.push_back(
-          op_lowerer->Lower(group,
-                            /*apply_op_schedule = */ false,
-                            /*apply_group_schedule=*/false));
+          op_lowerer.Lower(group,
+                           /*apply_op_schedule = */ false,
+                           /*apply_group_schedule=*/false));
     }
 
     VLOG(3) << "===========================No Schedule LoweredFunc "
1 change: 1 addition & 0 deletions paddle/cinn/backends/compiler.cc
@@ -19,6 +19,7 @@
 #include "paddle/cinn/backends/llvm/runtime_symbol_registry.h"
 #include "paddle/cinn/common/context.h"
 #include "paddle/cinn/hlir/framework/visualize_helper.h"
+#include "paddle/cinn/ir/utils/ir_printer.h"
 #ifdef CINN_WITH_CUDA
 #include "paddle/cinn/backends/codegen_cuda_dev.h"
 #include "paddle/cinn/backends/codegen_cuda_host.h"