Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OneDNN] Enable oneDNN by default in Paddle Inference API #58560

Merged
merged 28 commits into from
Nov 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
cb557dd
pr for oneDNN default
zhanglirong1999 Nov 1, 2023
578edf7
first commit for default
zhanglirong1999 Nov 2, 2023
6e6b485
add disable MKLDNN in CpuPassStrategy
zhanglirong1999 Nov 6, 2023
8f619bf
Add condition for compile without MKLDNN
zhanglirong1999 Nov 6, 2023
766aba6
fix CI error
zhanglirong1999 Nov 6, 2023
4f76140
for format
zhanglirong1999 Nov 6, 2023
8ee1d00
filter win vs2017
zhanglirong1999 Nov 6, 2023
888279a
disable mldnn in faster tokenizer
zhanglirong1999 Nov 7, 2023
f350967
skip or disable mkldnn when GPU,XPU, so on
zhanglirong1999 Nov 7, 2023
979fd50
fix some ut error
zhanglirong1999 Nov 8, 2023
9b81b1d
disable FLGA_mkldnn when NOT CPU place
zhanglirong1999 Nov 8, 2023
04c11d6
fix use_mkldnn for CI ut
zhanglirong1999 Nov 9, 2023
ad1f2c3
find why ut failed
zhanglirong1999 Nov 9, 2023
18e7e35
clear pass if not use_mkl
zhanglirong1999 Nov 9, 2023
75b0b6b
clear pass and add original cpu pass
zhanglirong1999 Nov 10, 2023
16e9e46
disable GLAGS_use_mkldnn
zhanglirong1999 Nov 10, 2023
af8aaf8
add ifdef PADDLE_WITH_DNNL
zhanglirong1999 Nov 10, 2023
c14c96f
fix test_conv2d_transpose_op and test_pad_op bug
zhanglirong1999 Nov 12, 2023
6fa2a81
enable mkldnn in FLAGS
zhanglirong1999 Nov 13, 2023
3172eea
disable mkldnn in three case
zhanglirong1999 Nov 13, 2023
24a5a4f
disable flags
zhanglirong1999 Nov 13, 2023
be74b27
change disable mkldnn position
zhanglirong1999 Nov 13, 2023
5bba802
set FLAGS_use_mkldnn in static.exe
zhanglirong1999 Nov 13, 2023
a394087
add PADDLE_WITH_DNNL
zhanglirong1999 Nov 14, 2023
422ff79
skip training defunct OneDNN
zhanglirong1999 Nov 14, 2023
8249efd
delete useless sentence
zhanglirong1999 Nov 14, 2023
9bc9260
unify passes to avoid duplicate
zhanglirong1999 Nov 14, 2023
99f5ac9
fix code style
zhanglirong1999 Nov 14, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 27 additions & 6 deletions paddle/fluid/inference/api/analysis_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,11 @@ void AnalysisConfig::EnableMKLDNN() {
Update();
}

// Turn off oneDNN (MKLDNN) acceleration for this config.
// Counterpart of EnableMKLDNN(); needed now that oneDNN is on by default.
void AnalysisConfig::DisableMKLDNN() {
// Clear the flag first so the Update() below rebuilds the pass strategy
// with oneDNN excluded.
use_mkldnn_ = false;
Update();
}

void AnalysisConfig::SetMkldnnCacheCapacity(int capacity) {
#ifdef PADDLE_WITH_DNNL
mkldnn_cache_capacity_ = capacity;
Expand Down Expand Up @@ -933,6 +938,24 @@ void AnalysisConfig::Update() {
}
}

#ifdef PADDLE_WITH_DNNL
// Since MKLDNN is enabled by default, pass_builder_ has already been
// created by the time this code runs. Three situations matter here:
// Case 1: the user manually disables MKLDNN after pass_builder_ creation
//         (config.disable_mkldnn()).
// Case 2: the device is GPU/IPU/XPU — EnableXpu(), EnableCUDNN(), etc.
//         have already reset the PassStrategy in the code block above.
// Case 3: pass_builder_ already exists and is a GpuPassStrategy (or
//         IpuPassStrategy), in which case neither enabling nor disabling
//         MKLDNN takes effect.
if (!use_gpu() && !use_xpu() && !use_ipu() && !use_custom_device() &&
!use_mkldnn_) {
// User manually disable mkldnn
pass_builder()->DisableMKLDNN();
}
#endif

if (use_tensorrt_) {
pass_builder()->ClearPasses();
for (const auto &pass : kTRTSubgraphPasses) {
Expand Down Expand Up @@ -976,15 +999,13 @@ void AnalysisConfig::Update() {
#endif
}

if (use_mkldnn_) {
if (!use_gpu() && !use_xpu() && !use_ipu()) {
if (use_mkldnn_ && enable_ir_optim_) {
#ifdef PADDLE_WITH_DNNL
if (!enable_ir_optim_) {
LOG(ERROR)
<< "EnableMKLDNN() only works when IR optimization is enabled.";
} else {
// default enable mkldnn when device is cpu and enable_ir_optim
pass_builder()->EnableMKLDNN();
}
#endif
}
}

// Quantization passes must come after all other optimization passes
Expand Down
16 changes: 15 additions & 1 deletion paddle/fluid/inference/api/paddle_analysis_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@
#include <vector>

#include "paddle_infer_declare.h" // NOLINT

/*! \file */
// Here we include some header files with relative paths, for that in deploy,
// the abstract path of this header file will be changed.
#include "paddle_api.h" // NOLINT
#include "paddle_pass_builder.h" // NOLINT
#ifdef PADDLE_WITH_DNNL
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle_mkldnn_quantizer_config.h" // NOLINT
#endif

Expand Down Expand Up @@ -929,6 +929,13 @@ struct PD_INFER_DECL AnalysisConfig {
///
///
void EnableMKLDNN();

///
/// \brief Turn off MKLDNN.
///
///
void DisableMKLDNN();

///
/// \brief Set the cache capacity of different input shapes for MKLDNN.
/// Default value 0 means not caching any shape.
Expand Down Expand Up @@ -1294,7 +1301,14 @@ struct PD_INFER_DECL AnalysisConfig {

std::unordered_set<std::string> trt_ops_run_float_;

#ifdef PADDLE_WITH_DNNL
// oneDNN is enabled by default, but only when the host CPU supports AVX2;
// MayIUse() probes the ISA at runtime. (The redundant `? true : false` on a
// boolean expression was dropped.)
bool use_mkldnn_{
    phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx2)};
#else
// Built without oneDNN support: it can never be enabled.
bool use_mkldnn_{false};
#endif

std::unordered_set<std::string> mkldnn_enabled_op_types_;

bool model_from_memory_{false};
Expand Down
69 changes: 39 additions & 30 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,39 @@ const std::vector<std::string> kCINNCompilerPasses{
"build_cinn_pass",
};

// Baseline IR passes for plain-CPU inference (no oneDNN-specific passes).
// Shared by CpuPassStrategy's constructor and its DisableMKLDNN() so the two
// stay in sync. NOTE: the large fusions must stay at the front so they are
// not damaged by the smaller ones.
const std::vector<std::string> CpuBasicPasses{
    "simplify_with_basic_ops_pass",
    "layer_norm_fuse_pass",
    "attention_lstm_fuse_pass",
    "seqconv_eltadd_relu_fuse_pass",
    // "seqpool_concat_fuse_pass",
    "seqpool_cvm_concat_fuse_pass",
    // "embedding_fc_lstm_fuse_pass",
    // TODO(wilber): fix correctness problem.
    // "fc_lstm_fuse_pass",
    "mul_lstm_fuse_pass",
    "fc_gru_fuse_pass",
    "mul_gru_fuse_pass",
    "seq_concat_fc_fuse_pass",
    "gpu_cpu_squeeze2_matmul_fuse_pass",
    "gpu_cpu_reshape2_matmul_fuse_pass",
    "gpu_cpu_flatten2_matmul_fuse_pass",
    "matmul_v2_scale_fuse_pass",
    "gpu_cpu_map_matmul_v2_to_mul_pass",
    "gpu_cpu_map_matmul_v2_to_matmul_pass",
    "matmul_scale_fuse_pass",
    "gpu_cpu_map_matmul_to_mul_pass",
    "fc_fuse_pass",
    "repeated_fc_relu_fuse_pass",
    "squared_mat_sub_fuse_pass",
    "conv_bn_fuse_pass",
    "conv_eltwiseadd_bn_fuse_pass",
    "conv_transpose_bn_fuse_pass",
    "conv_transpose_eltwiseadd_bn_fuse_pass",
    "is_test_pass",
    "constant_folding_pass",
};

GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
passes_.assign({
"map_op_to_another_pass", //
Expand Down Expand Up @@ -309,36 +342,7 @@ void GpuPassStrategy::DisableMkldnnFcPasses() {
CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
// NOTE the large fusions should be located in the front, so that they will
// not be damaged by smaller ones.
passes_.assign({"simplify_with_basic_ops_pass", //
"layer_norm_fuse_pass",
"attention_lstm_fuse_pass", //
"seqconv_eltadd_relu_fuse_pass", //
// "seqpool_concat_fuse_pass", //
"seqpool_cvm_concat_fuse_pass", //
// "embedding_fc_lstm_fuse_pass", //
// TODO(wilber): fix correctness problem.
// "fc_lstm_fuse_pass", //
"mul_lstm_fuse_pass", //
"fc_gru_fuse_pass", //
"mul_gru_fuse_pass", //
"seq_concat_fc_fuse_pass", //
"gpu_cpu_squeeze2_matmul_fuse_pass", //
"gpu_cpu_reshape2_matmul_fuse_pass", //
"gpu_cpu_flatten2_matmul_fuse_pass", //
"matmul_v2_scale_fuse_pass", //
"gpu_cpu_map_matmul_v2_to_mul_pass", //
"gpu_cpu_map_matmul_v2_to_matmul_pass", //
"matmul_scale_fuse_pass", //
"gpu_cpu_map_matmul_to_mul_pass", //
"fc_fuse_pass", //
"repeated_fc_relu_fuse_pass", //
"squared_mat_sub_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
"conv_transpose_bn_fuse_pass", //
"conv_transpose_eltwiseadd_bn_fuse_pass", //
"is_test_pass", //
"constant_folding_pass"});
passes_.assign(CpuBasicPasses.begin(), CpuBasicPasses.end());

use_gpu_ = false;
}
Expand Down Expand Up @@ -391,6 +395,11 @@ void CpuPassStrategy::EnableMKLDNN() {
#endif
}

// Drop every pass (including any oneDNN ones added by EnableMKLDNN) and
// restore the plain-CPU baseline pass list.
// NOTE(review): this does not reset any use_mkldnn_-style state on the
// strategy; if EnableMKLDNN() guards on such a flag, re-enabling after a
// disable could silently no-op — verify against EnableMKLDNN()'s body.
void CpuPassStrategy::DisableMKLDNN() {
ClearPasses();
passes_.assign(CpuBasicPasses.begin(), CpuBasicPasses.end());
}

void CpuPassStrategy::EnableMkldnnQuantizer() {
#ifdef PADDLE_WITH_DNNL
if (!use_mkldnn_quantizer_) {
Expand Down
6 changes: 6 additions & 0 deletions paddle/fluid/inference/api/paddle_pass_builder.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,9 @@ class PD_INFER_DECL PassStrategy : public PaddlePassBuilder {
/// still be some CPU kernels running in GPU mode.
virtual void EnableMKLDNN() {}

/// \brief Disable the use of MKLDNN.
virtual void DisableMKLDNN() {}

/// \brief Enable MKLDNN quantize optimization.
virtual void EnableMkldnnQuantizer() {}

Expand Down Expand Up @@ -212,6 +215,9 @@ class PD_INFER_DECL CpuPassStrategy : public PassStrategy {
/// \brief Enable the use of MKLDNN.
void EnableMKLDNN() override;

/// \brief Disable the use of MKLDNN.
void DisableMKLDNN() override;

/// \brief Enable MKLDNN quantize optimization.
void EnableMkldnnQuantizer() override;

Expand Down
1 change: 1 addition & 0 deletions paddle/fluid/pybind/inference_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,7 @@ void BindAnalysisConfig(py::module *m) {
&AnalysisConfig::SwitchIrDebug,
py::arg("x") = true)
.def("enable_mkldnn", &AnalysisConfig::EnableMKLDNN)
.def("disable_mkldnn", &AnalysisConfig::DisableMKLDNN)
.def("mkldnn_enabled", &AnalysisConfig::mkldnn_enabled)
.def("set_cpu_math_library_num_threads",
&AnalysisConfig::SetCpuMathLibraryNumThreads)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->SwitchIrOptim();
cfg->SwitchSpecifyInputNames();
cfg->SetCpuMathLibraryNumThreads(FLAGS_num_threads);
if (FLAGS_enable_mkldnn) cfg->EnableMKLDNN();
if (!FLAGS_enable_mkldnn) cfg->DisableMKLDNN();
}

TEST(Analyzer_bfloat16_image_classification, bfloat16) {
Expand Down
4 changes: 2 additions & 2 deletions test/cpp/inference/api/analyzer_transformer_compare_tester.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ namespace transformer_tester {
void compare(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
if (!use_mkldnn) {
cfg.DisableMKLDNN();
}

std::vector<std::vector<PaddleTensor>> input_slots_all;
Expand Down
4 changes: 2 additions & 2 deletions test/ir/inference/auto_scan_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,8 @@ def create_inference_config(
config.switch_ir_optim(ir_optim)
if use_gpu:
config.enable_use_gpu(100, 0)
if use_mkldnn:
config.enable_mkldnn()
if not use_mkldnn:
config.disable_mkldnn()
if use_xpu:
config.enable_xpu()
if passes is not None:
Expand Down
1 change: 1 addition & 0 deletions test/ir/inference/inference_pass_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ def _get_analysis_config(
self.path + ".pdmodel", self.path + ".pdiparams"
)
config.disable_gpu()
config.disable_mkldnn()
config.switch_specify_input_names(True)
config.switch_ir_optim(True)
config.switch_use_feed_fetch_ops(False)
Expand Down
1 change: 1 addition & 0 deletions test/ir/inference/quant_dequant_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ def _get_analysis_config(
self.path + ".pdmodel", self.path + ".pdiparams"
)
config.disable_gpu()
config.disable_mkldnn()
config.switch_specify_input_names(True)
config.switch_ir_optim(True)
config.switch_use_feed_fetch_ops(False)
Expand Down
3 changes: 1 addition & 2 deletions test/ir/inference/test_conv_act_onednn_fuse_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@

class TestConvActOneDNNFusePass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_gpu=False)
config.enable_mkldnn()
config = self.create_inference_config(use_gpu=False, use_mkldnn=True)
yield config, ['fused_conv2d'], (1e-4, 1e-5)

def is_program_valid(self, prog_config):
Expand Down
3 changes: 1 addition & 2 deletions test/ir/inference/test_conv_bn_fuse_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,7 @@ def generate_bn_Var():
def sample_predictor_configs(self, program_config):
# for mkldnn
if program_config.ops[0].attrs['use_mkldnn']:
config = self.create_inference_config()
config.enable_mkldnn()
config = self.create_inference_config(use_mkldnn=True)
yield config, ['fused_conv2d'], (1e-5, 1e-5)
else:
config = self.create_inference_config()
Expand Down
4 changes: 2 additions & 2 deletions test/ir/inference/test_conv_transpose_bn_fuse_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,12 +194,12 @@ def generate_batch_norm_Variance():

def sample_predictor_configs(self, program_config):
# for mkldnn
config = self.create_inference_config()
if program_config.ops[0].attrs['use_mkldnn']:
config.enable_mkldnn()
config = self.create_inference_config(use_mkldnn=True)
yield config, ['conv2d_transpose'], (1e-5, 1e-5)
# for cpu
else:
config = self.create_inference_config()
yield config, ['conv2d_transpose', 'elementwise_add'], (1e-5, 1e-5)

def is_program_valid(self, program_config: ProgramConfig) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -220,12 +220,12 @@ def generate_batch_norm_Variance():

def sample_predictor_configs(self, program_config):
# for mkldnn
config = self.create_inference_config()
if program_config.ops[2].attrs['use_mkldnn']:
config.enable_mkldnn()
config = self.create_inference_config(use_mkldnn=True)
yield config, ['conv2d_transpose', 'elementwise_add'], (1e-5, 1e-5)
# cpu
else:
config = self.create_inference_config()
yield config, ['conv2d_transpose', 'elementwise_add'], (1e-5, 1e-5)

def is_program_valid(self, program_config: ProgramConfig) -> bool:
Expand Down
3 changes: 1 addition & 2 deletions test/ir/inference/test_onednn_conv_bias_fuse_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@

class TestConvBiasOneDNNFusePass(PassAutoScanTest):
def sample_predictor_configs(self, program_config):
config = self.create_inference_config(use_gpu=False)
config.enable_mkldnn()
config = self.create_inference_config(use_gpu=False, use_mkldnn=True)
yield config, ['fused_conv2d'], (1e-4, 1e-5)

def is_program_valid(self, prog_config):
Expand Down
3 changes: 1 addition & 2 deletions test/ir/inference/test_onednn_conv_bn_fuse_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,7 @@ def generate_data(shape):
return program_config

def sample_predictor_configs(self, program_config):
config = self.create_inference_config()
config.enable_mkldnn()
config = self.create_inference_config(use_mkldnn=True)
yield config, ['fused_conv2d'], (1e-5, 1e-5)

def test(self):
Expand Down
5 changes: 5 additions & 0 deletions test/legacy_test/op_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1971,6 +1971,8 @@ def check_output_with_place(
):
core._set_prim_all_enabled(False)
core.set_prim_eager_enabled(False)
if not self.is_mkldnn_op():
set_flags({"FLAGS_use_mkldnn": False})

if hasattr(self, "use_custom_device") and self.use_custom_device:
check_dygraph = False
Expand Down Expand Up @@ -2917,6 +2919,9 @@ def check_grad_with_place(
if hasattr(self, "use_custom_device") and self.use_custom_device:
check_dygraph = False

if not self.is_mkldnn_op():
set_flags({"FLAGS_use_mkldnn": False})

core._set_prim_all_enabled(False)
core.set_prim_eager_enabled(False)
if check_prim:
Expand Down
1 change: 1 addition & 0 deletions test/legacy_test/test_attribute_var.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ def infer_prog(self):
config = paddle_infer.Config(
self.save_path + '.pdmodel', self.save_path + '.pdiparams'
)
config.disable_mkldnn()
predictor = paddle_infer.create_predictor(config)
input_names = predictor.get_input_names()
for i, shape in enumerate(self.shapes):
Expand Down
1 change: 1 addition & 0 deletions test/tokenizer/test_faster_tokenizer_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def __init__(self, model_dir):

# fast_tokenizer op only support cpu.
config.disable_gpu()
config.disable_mkldnn()
config.set_cpu_math_library_num_threads(10)

config.switch_use_feed_fetch_ops(False)
Expand Down