resolve conflict with develop

PaddlePaddle · Sep 28, 2022 · b384e8b · b384e8b
2 parents 6cf126d + 844d985
commit b384e8b
Show file tree

Hide file tree

Showing 59 changed files with 2,512 additions and 1,237 deletions.
diff --git a/.flake8 b/.flake8
@@ -16,7 +16,7 @@ ignore =
     E701,E711,E712,E713,E714,E721,E722,E731,E741,
 
     # F, see https://flake8.pycqa.org/en/latest/user/error-codes.html
-    F401,F402,F403,F405,
+    F402,F403,F405,
     F522,F524,F541,
     F601,F631,F632,
     F811,F821,F841,
@@ -28,3 +28,72 @@ per-file-ignores =
     # These files need tabs for testing.
     python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py:E101,W191
     python/paddle/fluid/tests/unittests/collective/fleet/test_hdfs1.py:E101,W191
+    # Ignore unused imports in __init__.py
+    __init__.py: F401
+    # These files will be fixed later
+    r/*:F401
+    cmake/*:F401
+    paddle/*:F401
+    tools/*:F401
+    python/paddle/signal.py:F401
+    python/paddle/common_ops_import.py:F401
+    python/paddle/check_import_scipy.py:F401
+    python/paddle/fft.py:F401
+    python/paddle/_C_ops.py:F401
+    python/paddle/framework/*:F401
+    python/paddle/reader/*:F401
+    python/paddle/nn/*:F401
+    python/paddle/distributed/*:F401
+    python/paddle/proto/*:F401
+    python/paddle/onnx/*:F401
+    python/paddle/optimizer/*:F401
+    python/paddle/hapi/*:F401
+    python/paddle/autograd/*:F401
+    python/paddle/dataset/*:F401
+    python/paddle/io/*:F401
+    python/paddle/cost_model/*:F401
+    python/paddle/tests/*:F401
+    python/paddle/distribution/*:F401
+    python/paddle/geometric/*:F401
+    python/paddle/utils/*:F401
+    python/paddle/vision/*:F401
+    python/paddle/quantization/*:F401
+    python/paddle/libs/*:F401
+    python/paddle/audio/*:F401
+    python/paddle/incubate/*:F401
+    python/paddle/amp/*:F401
+    python/paddle/jit/*:F401
+    python/paddle/static/*:F401
+    python/paddle/inference/*:F401
+    python/paddle/device/*:F401
+    python/paddle/profiler/*:F401
+    python/paddle/tensor/*:F401
+    python/paddle/text/*:F401
+    python/paddle/metric/*:F401
+    python/paddle/fluid/tests/custom_kernel/*:F401
+    python/paddle/fluid/tests/custom_runtime/*:F401
+    python/paddle/fluid/tests/unittests/interpreter/*:F401
+    python/paddle/fluid/tests/unittests/asp/*:F401
+    python/paddle/fluid/tests/unittests/autograd/*:F401
+    python/paddle/fluid/tests/unittests/ir/*:F401
+    python/paddle/fluid/tests/unittests/collective/*:F401
+    python/paddle/fluid/tests/unittests/tokenizer/*:F401
+    python/paddle/fluid/tests/unittests/xpu/*:F401
+    python/paddle/fluid/tests/unittests/distribution/*:F401
+    python/paddle/fluid/tests/unittests/mlu/*:F401
+    python/paddle/fluid/tests/unittests/npu/*:F401
+    python/paddle/fluid/tests/unittests/ipu/*:F401
+    python/paddle/fluid/tests/unittests/distributed_passes/*:F401
+    python/paddle/fluid/tests/unittests/auto_parallel/*:F401
+    python/paddle/fluid/tests/unittests/dygraph_to_static/*:F401
+    python/paddle/fluid/tests/unittests/ps/*:F401
+    python/paddle/fluid/tests/unittests/fft/*:F401
+    python/paddle/fluid/tests/unittests/white_list/*:F401
+    python/paddle/fluid/tests/unittests/sequence/*:F401
+    python/paddle/fluid/tests/unittests/mkldnn/*:F401
+    python/paddle/fluid/tests/unittests/rnn/*:F401
+    python/paddle/fluid/tests/book/*:F401
+    python/paddle/fluid/tests/custom_op/*:F401
+    python/paddle/fluid/tests/unittests/test_*:F401
+    python/paddle/fluid/tests/test_*:F401
+    python/paddle/fluid/tests/*:F401
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -30,11 +30,6 @@ repos:
     hooks:
     -   id: yapf
         files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
-        exclude: |
-            (?x)^(
-                python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py|
-                python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py
-            )$
 -   repo: https://github.com/PyCQA/flake8
     rev: 4.0.1
     hooks:

diff --git a/cmake/phi.cmake b/cmake/phi.cmake
@@ -78,7 +78,7 @@ function(kernel_declare TARGET_LIST)
     string(
       REGEX
         MATCH
-        "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*"
+        "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*"
         first_registry
         "${kernel_impl}")
     if(NOT first_registry STREQUAL "")
@@ -89,38 +89,23 @@ function(kernel_declare TARGET_LIST)
           continue()
         endif()
       endif()
-      # parse the first kernel name
-      string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}")
-      string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name
-                     "${kernel_name}")
-      string(REPLACE "," "" kernel_name "${kernel_name}")
-      string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}")
-      string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}")
+      # parse the registerd kernel message
+      string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_msg "${first_registry}")
+      string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_msg
+                     "${kernel_msg}")
+      string(REPLACE "," ";" kernel_msg "${kernel_msg}")
+      string(REGEX REPLACE "[ \\\t\r\n]+" "" kernel_msg "${kernel_msg}")
+      string(REGEX REPLACE "//cuda_only" "" kernel_msg "${kernel_msg}")
+
+      list(GET kernel_msg 0 kernel_name)
+      list(GET kernel_msg 1 kernel_backend)
+      list(GET kernel_msg 2 kernel_layout)
+
       # append kernel declare into declarations.h
-      # TODO(chenweihang): default declare ALL_LAYOUT for each kernel
-      if(${kernel_path} MATCHES "./cpu\/")
-        file(APPEND ${kernel_declare_file}
-             "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
-      elseif(${kernel_path} MATCHES "./gpu\/")
-        file(APPEND ${kernel_declare_file}
-             "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n")
-      elseif(${kernel_path} MATCHES "./xpu\/")
-        file(APPEND ${kernel_declare_file}
-             "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n")
-      elseif(${kernel_path} MATCHES "./gpudnn\/")
-        file(APPEND ${kernel_declare_file}
-             "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n")
-      elseif(${kernel_path} MATCHES "./kps\/")
-        file(APPEND ${kernel_declare_file}
-             "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n")
-      elseif(${kernel_path} MATCHES "./onednn\/")
-        file(APPEND ${kernel_declare_file}
-             "PD_DECLARE_KERNEL(${kernel_name}, OneDNN, ALL_LAYOUT);\n")
-      else()
-        # deal with device independent kernel, now we use CPU temporaary
-        file(APPEND ${kernel_declare_file}
-             "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n")
-      endif()
+      file(
+        APPEND ${kernel_declare_file}
+        "PD_DECLARE_KERNEL(${kernel_name}, ${kernel_backend}, ${kernel_layout});\n"
+      )
     endif()
   endforeach()
 endfunction()

diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -142,11 +142,22 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
   if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
           op_with_kernel->Type())) {
     auto phi_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx);
+    auto phi_kernel_name = op_with_kernel->PhiKernelSignature()->name;
     VLOG(6) << "phi_kernel_key " << phi_kernel_key << "\n";
+    VLOG(6) << "phi_kernel_name " << phi_kernel_name << "\n";
 
     if (op_with_kernel->PhiKernel()->IsValid()) {
       run_phi_kernel = true;
     }
+
+    // For data transfer ops, they should not fallback to cpu.
+    // Though they're device-independent operations,
+    // their implementations are device-related.
+    // For example, consider changing the layout of a gpu tensor
+    // while the gpu kernel of transfer_layout op does not exist.
+    // To use the cpu kernel, you must insert memcpy_d2h/mepcpy_h2d op
+    // in addition. But such operation should not be done here.
+    // Maybe in future we will support this.
   }
 
   // 3. Execute transfer op and construct OpFuncNode

diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc
@@ -150,6 +150,13 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key,
         phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype());
   }
 #endif
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (platform::is_gpu_place(expected_kernel_key.place_)) {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "For GPU kernel, they must not fallback into CPU kernel."));
+  }
+#endif
+
   return phi::KernelKey();
 }
 

diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h
@@ -590,33 +590,29 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis0(const T *in,
                                                            const T *scale,
                                                            const int bin_cnt,
                                                            const int round_type,
-                                                           const int n,
-                                                           const int c,
+                                                           const int wh_size,
+                                                           const int num,
+                                                           const int cout,
                                                            T *out) {
-  int tid = threadIdx.x;
-
-  int channel_size = n / c;
-  const T *in_c = in + blockIdx.x * channel_size;
-  T *out_c = out + blockIdx.x * channel_size;
-
-  T s = scale[blockIdx.x];
-  T inv_s = inverse(s);
+  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
 
-  for (int i = tid; i < channel_size; i += blockDim.x) {
-    T x = in_c[i];
+  for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
+    T s = scale[(i / wh_size) % cout];
+    T inv_s = inverse(s);
+    T x = in[i];
     if (round_type == 0) {
       x = bin_cnt * inv_s * x;
       x = roundWithTiesToEven(x);
       T max_bound = bin_cnt;
       T min_bound = -bin_cnt - static_cast<T>(1);
       x = x > max_bound ? max_bound : x;
       x = x < min_bound ? min_bound : x;
-      out_c[i] = (x * s) / bin_cnt;
+      out[i] = (x * s) / bin_cnt;
     } else {
       T v = x > s ? s : x;
       v = v < -s ? -s : v;
       v = bin_cnt * inv_s * v;
-      out_c[i] = round(v) * s / bin_cnt;
+      out[i] = round(v) * s / bin_cnt;
     }
   }
 }
@@ -627,32 +623,29 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in,
                                                            const T *scale,
                                                            const int bin_cnt,
                                                            const int round_type,
-                                                           const int n,
-                                                           const int cin,
+                                                           const int wh_size,
+                                                           const int num,
                                                            const int cout,
                                                            T *out) {
-  T s = scale[blockIdx.x % cout];
-  T inv_s = inverse(s);
-
-  int wh_size = n / (cin * cout);
-  const T *in_c = in + blockIdx.x * wh_size;
-  T *out_c = out + blockIdx.x * wh_size;
+  int64_t idx = blockDim.x * blockIdx.x + threadIdx.x;
 
-  for (int i = threadIdx.x; i < wh_size; i += blockDim.x) {
-    T x = in_c[i];
+  for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) {
+    T s = scale[(i / wh_size) % cout];
+    T inv_s = inverse(s);
+    T x = in[i];
     if (round_type == 0) {
       x = bin_cnt * inv_s * x;
       x = roundWithTiesToEven(x);
       T max_bound = bin_cnt;
       T min_bound = -bin_cnt - static_cast<T>(1);
       x = x > max_bound ? max_bound : x;
       x = x < min_bound ? min_bound : x;
-      out_c[i] = (x * s) / bin_cnt;
+      out[i] = (x * s) / bin_cnt;
     } else {
       T v = x > s ? s : x;
       v = v < -s ? -s : v;
       v = bin_cnt * inv_s * v;
-      out_c[i] = round(v) * s / bin_cnt;
+      out[i] = round(v) * s / bin_cnt;
     }
   }
 }
@@ -682,30 +675,39 @@ struct ChannelClipFakeQuantDequantFunctor<phi::GPUContext, T> {
     const T *scale_data = scale.data<T>();
     T *out_data = out->mutable_data<T>(ctx.GetPlace());
 
+    int64_t block_size =
+        std::min(static_cast<int64_t>(num),
+                 static_cast<int64_t>(ctx.GetMaxThreadsPerBlock() / 4));
+
+    int64_t max_threads = ctx.GetMaxPhysicalThreadCount();  // SM * block_per_SM
+    const int64_t max_blocks =
+        std::max(((max_threads - 1) / block_size + 1), static_cast<int64_t>(1));
+    const int64_t grid_size =
+        std::min(max_blocks, (num + block_size - 1) / block_size);
+
     if (quant_axis == 0) {
-      int grid = in_dims[0];
-      int block = 1024;
+      const int window_size = num / in_dims[0];
       ChannelClipAndQuantDequantKernelQuantAxis0<T>
-          <<<grid, block, 0, ctx.stream()>>>(in_data,
-                                             scale_data,
-                                             bin_cnt,
-                                             round_type,
-                                             num,
-                                             in_dims[0],
-                                             out_data);
+          <<<grid_size, block_size, 0, ctx.stream()>>>(in_data,
+                                                       scale_data,
+                                                       bin_cnt,
+                                                       round_type,
+                                                       window_size,
+                                                       num,
+                                                       in_dims[0],
+                                                       out_data);
     } else if (quant_axis == 1) {
-      int grid = in_dims[0] * in_dims[1];
-      int block = 1024;
+      const int window_size = num / (in_dims[0] * in_dims[1]);
 
       ChannelClipAndQuantDequantKernelQuantAxis1<T>
-          <<<grid, block, 0, ctx.stream()>>>(in_data,
-                                             scale_data,
-                                             bin_cnt,
-                                             round_type,
-                                             num,
-                                             in_dims[0],
-                                             in_dims[1],
-                                             out_data);
+          <<<grid_size, block_size, 0, ctx.stream()>>>(in_data,
+                                                       scale_data,
+                                                       bin_cnt,
+                                                       round_type,
+                                                       window_size,
+                                                       num,
+                                                       in_dims[1],
+                                                       out_data);
     }
   }
 };