From 960d2625c13bd794b5cb137ac42fe2cd93872e59 Mon Sep 17 00:00:00 2001 From: zhangbopd <1299246947@qq.com> Date: Thu, 9 Feb 2023 13:18:46 +0800 Subject: [PATCH 1/4] first commit. --- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 128 ++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 1350cb2209c31..ba9c0d42fb3e5 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -21,6 +21,134 @@ namespace phi { +template +__device__ void WelfordOnline(float val, float *mean, float *m2, float *count) { + *count += 1; + float delta1 = val - *mean; + *mean += delta1 / (*count); + float delta2 = val - *mean; + *m2 += delta1 * delta2; +} + +template +__device__ void WelfordOnline(float b_mean, + float b_m2, + float b_count, + float *mean, + float *m2, + float *count) { + if (b_count == 0) { + return; + } + float new_count = *count + b_count; + float nb_n = b_count / new_count; + float delta = b_mean - *mean; + *mean += delta * nb_n; + *m2 += b_m2 + delta * delta * (*count) * nb_n; + *count = new_count; +} + +__device__ void WelfordWarpAllReduce(float thread_mean, + float thread_m2, + float thread_count, + float *mean, + float *m2, + float *count) { + *mean = thread_mean; + *m2 = thread_m2; + *count = thread_count; + for (int mask = 1; mask < 32; mask *= 2) { + float b_mean = __shfl_down_sync(0xffffffff, *mean, mask); + float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask); + float b_count = __shfl_down_sync(0xffffffff, *count, mask); + WelfordOnline(b_mean, b_m2, b_count, mean, m2, count); + } + + *mean = __shfl_sync(0xffffffff, *mean, 0, 32); + *m2 = __shfl_sync(0xffffffff, *m2, 0, 32); + *count = __shfl_sync(0xffffffff, *count, 0, 32); +} + +template +__global__ void LayerNormFwdWithWelford(T *input, + T *output, + T *gamma, + T *beta, + float *mean, + float *invvar, + int rows, + int cols, + double epsilon) { + int threadidx_x = threadIdx.x / 32; + int threadidx_y = threadIdx.x % 32; + int row_offset = blockIdx.x * 4 + threadidx_x; + int cols_per_thread = (cols + 31) / 32; + int cols_this_thread = cols_per_thread; + + int last_y = (cols / cols_per_thread); + + if (threadidx_y == last_y) { + cols_this_thread = cols - cols_per_thread * last_y; + } else if (threadidx_y > last_y) { + cols_this_thread = 0; + } + + int lane_id = threadidx_y; + + if (row_offset < rows) { + float buf[32]; + + float thread_mean = 0.f; + float thread_m2 = 0.f; + float thread_count = 0.f; + + float warp_mean; + float warp_m2; + float warp_count; + + T *row_input = input + row_offset * cols; + T *row_output = output + row_offset * cols; + +#pragma unroll + for (int i = 0; i < cols_this_thread; i++) { + buf[i] = static_cast(row_input[lane_id * cols_per_thread + i]); + } + +#pragma unroll + for (int i = 0; i < cols_this_thread; i++) { + WelfordOnline(buf[i], &thread_mean, &thread_m2, &thread_count); + } + + WelfordWarpAllReduce(thread_mean, + thread_m2, + thread_count, + &warp_mean, + &warp_m2, + &warp_count); + + float row_mean = warp_mean; + float row_variance = max(warp_m2 / warp_count, 0.f); + float row_inv_var = rsqrt(row_variance + epsilon); + + if (lane_id == 0) { + mean[row_offset] = row_mean; + invvar[row_offset] = row_inv_var; + } + +#pragma unroll + for (int i = 0; i < cols_this_thread; ++i) { + buf[i] = (buf[i] - row_mean) * row_inv_var; + } + +#pragma unroll + for (int i = 0; i < cols_this_thread; ++i) { + row_output[lane_id * cols_per_thread + i] = + 
static_cast(buf[i]) * gamma[lane_id * cols_per_thread + i] + + beta[lane_id * cols_per_thread + i]; + } + } +} + template void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, const T *input, From dfca3ea56fb545dc0e6c291db45fcc868b97a4d3 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Thu, 9 Feb 2023 15:32:36 +0000 Subject: [PATCH 2/4] change host logic --- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 148 ++------------------ 1 file changed, 9 insertions(+), 139 deletions(-) diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index ba9c0d42fb3e5..4dc238208ca1a 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -21,134 +21,6 @@ namespace phi { -template -__device__ void WelfordOnline(float val, float *mean, float *m2, float *count) { - *count += 1; - float delta1 = val - *mean; - *mean += delta1 / (*count); - float delta2 = val - *mean; - *m2 += delta1 * delta2; -} - -template -__device__ void WelfordOnline(float b_mean, - float b_m2, - float b_count, - float *mean, - float *m2, - float *count) { - if (b_count == 0) { - return; - } - float new_count = *count + b_count; - float nb_n = b_count / new_count; - float delta = b_mean - *mean; - *mean += delta * nb_n; - *m2 += b_m2 + delta * delta * (*count) * nb_n; - *count = new_count; -} - -__device__ void WelfordWarpAllReduce(float thread_mean, - float thread_m2, - float thread_count, - float *mean, - float *m2, - float *count) { - *mean = thread_mean; - *m2 = thread_m2; - *count = thread_count; - for (int mask = 1; mask < 32; mask *= 2) { - float b_mean = __shfl_down_sync(0xffffffff, *mean, mask); - float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask); - float b_count = __shfl_down_sync(0xffffffff, *count, mask); - WelfordOnline(b_mean, b_m2, b_count, mean, m2, count); - } - - *mean = __shfl_sync(0xffffffff, *mean, 0, 32); - *m2 = __shfl_sync(0xffffffff, *m2, 0, 32); - *count = __shfl_sync(0xffffffff, *count, 0, 32); -} - -template -__global__ void LayerNormFwdWithWelford(T *input, - T *output, - T *gamma, - T *beta, - float *mean, - float *invvar, - int rows, - int cols, - double epsilon) { - int threadidx_x = threadIdx.x / 32; - int threadidx_y = threadIdx.x % 32; - int row_offset = blockIdx.x * 4 + threadidx_x; - int cols_per_thread = (cols + 31) / 32; - int cols_this_thread = cols_per_thread; - - int last_y = (cols / cols_per_thread); - - if (threadidx_y == last_y) { - cols_this_thread = cols - cols_per_thread * last_y; - } else if (threadidx_y > last_y) { - cols_this_thread = 0; - } - - int lane_id = threadidx_y; - - if (row_offset < rows) { - float buf[32]; - - float thread_mean = 0.f; - float thread_m2 = 0.f; - float thread_count = 0.f; - - float warp_mean; - float warp_m2; - float warp_count; - - T *row_input = input + row_offset * cols; - T *row_output = output + row_offset * cols; - -#pragma unroll - for (int i = 0; i < cols_this_thread; i++) { - buf[i] = static_cast(row_input[lane_id * cols_per_thread + i]); - } - -#pragma unroll - for (int i = 0; i < cols_this_thread; i++) { - WelfordOnline(buf[i], &thread_mean, &thread_m2, &thread_count); - } - - WelfordWarpAllReduce(thread_mean, - thread_m2, - thread_count, - &warp_mean, - &warp_m2, - &warp_count); - - float row_mean = warp_mean; - float row_variance = max(warp_m2 / warp_count, 0.f); - float row_inv_var = rsqrt(row_variance + epsilon); - - if (lane_id == 0) { - mean[row_offset] = row_mean; - invvar[row_offset] = row_inv_var; - } - -#pragma unroll - for (int i 
= 0; i < cols_this_thread; ++i) { - buf[i] = (buf[i] - row_mean) * row_inv_var; - } - -#pragma unroll - for (int i = 0; i < cols_this_thread; ++i) { - row_output[lane_id * cols_per_thread + i] = - static_cast(buf[i]) * gamma[lane_id * cols_per_thread + i] + - beta[lane_id * cols_per_thread + i]; - } - } -} - template void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, const T *input, @@ -203,12 +75,14 @@ void LayerNormKernel(const Context &dev_ctx, auto *mean_data = dev_ctx.template Alloc(mean); auto *var_data = dev_ctx.template Alloc(var); - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); + bool valid_scale = scale != nullptr; + bool valid_bias = bias != nullptr; + auto *void_scale_data = valid_scale ? scale->data() : nullptr; + auto *void_bias_data = valid_bias ? bias->data() : nullptr; auto x_dtype = x.dtype(); phi::DataType scale_bias_dtype; - if (void_scale_data != nullptr) { + if (valid_scale) { scale_bias_dtype = scale->dtype(); if (void_bias_data != nullptr) { PADDLE_ENFORCE_EQ( @@ -218,11 +92,10 @@ void LayerNormKernel(const Context &dev_ctx, "should have the same data type.")); } } else { - scale_bias_dtype = (void_bias_data != nullptr ? bias->dtype() : x_dtype); + scale_bias_dtype = valid_bias ? bias->dtype() : x_dtype; } - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { + if (x_dtype != scale_bias_dtype) { PADDLE_ENFORCE_EQ(scale_bias_dtype, paddle::experimental::CppTypeToDataType::Type(), phi::errors::InvalidArgument( @@ -329,11 +202,8 @@ void LayerNormKernel(const Context &dev_ctx, } } else { #endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } + + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); #ifdef PADDLE_WITH_CUDA } #endif From 9dd40ce61f6f3ad4d265711a734aee6af4dbda38 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Thu, 9 Feb 2023 15:51:13 +0000 Subject: [PATCH 3/4] fix code bugs --- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 52 +++++++++------------ 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 4dc238208ca1a..328c6ee4020b6 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -95,41 +95,16 @@ void LayerNormKernel(const Context &dev_ctx, scale_bias_dtype = valid_bias ? 
bias->dtype() : x_dtype; } - if (x_dtype != scale_bias_dtype) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - paddle::experimental::CppTypeToDataType::Type(), - phi::errors::InvalidArgument( - "Unsupported data type of Scale and Bias")); - } + PADDLE_ENFORCE_EQ( + scale_bias_dtype, + x_dtype, + phi::errors::InvalidArgument("Unsupported data type of Scale and Bias")); auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); int64_t batch_size = static_cast(matrix_dim[0]); int64_t feature_size = static_cast(matrix_dim[1]); - auto stream = dev_ctx.stream(); -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (paddle::operators::GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - paddle::operators:: \ - LayerNormForward \ - <<>>( \ - x_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ - y_data, \ - mean_data, \ - var_data, \ - epsilon, \ - feature_size)); \ - default: \ - PADDLE_THROW(phi::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - #define PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, feature_size) \ case (feature_size): { \ constexpr int WARPS_N = feature_size < 1024 ? 1 : (feature_size / 1024); \ @@ -203,12 +178,27 @@ void LayerNormKernel(const Context &dev_ctx, } else { #endif - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + paddle::operators::LayerNormForward + <<>>( + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + y_data, + mean_data, + var_data, + epsilon, + feature_size)); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Product from begin_norm_axis to end must be larger than 1")); + break; + } #ifdef PADDLE_WITH_CUDA } #endif -#undef PADDLE_LAUNCH_LAYERNORM_FWD #undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD } From 39faa5e2d280c0906f845820b3c8895af322e1f6 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Thu, 9 Feb 2023 16:24:33 +0000 Subject: [PATCH 4/4] fix code error --- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 97 ++++++++------------- 1 file changed, 38 insertions(+), 59 deletions(-) diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 328c6ee4020b6..b08f860199840 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -105,45 +105,31 @@ void LayerNormKernel(const Context &dev_ctx, int64_t feature_size = static_cast(matrix_dim[1]); auto stream = dev_ctx.stream(); -#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, feature_size) \ - case (feature_size): { \ - constexpr int WARPS_N = feature_size < 1024 ? 1 : (feature_size / 1024); \ - constexpr int WARPS_M = 4 / WARPS_N; \ - const int THREADS_PER_WARP = 32; \ - const int BYTES_PER_LDG = 16; \ - const int VecSize = BYTES_PER_LDG / sizeof(T); \ - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ - const int ROWS_PER_CTA = WARPS_M; \ - const int grid = static_cast( \ - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); \ - paddle::operators::fast_ln_fwd_kernel \ - <<>>( \ - batch_size, \ - feature_size, \ - epsilon, \ - x_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ - mean_data, \ - var_data, \ - y_data); \ +#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(feature_size) \ + case (feature_size): { \ + constexpr int WARPS_N = feature_size < 1024 ? 
1 : (feature_size / 1024); \ + constexpr int WARPS_M = 4 / WARPS_N; \ + const int THREADS_PER_WARP = 32; \ + const int BYTES_PER_LDG = 16; \ + const int VecSize = BYTES_PER_LDG / sizeof(T); \ + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ + const int ROWS_PER_CTA = WARPS_M; \ + const int grid = static_cast( \ + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); \ + paddle::operators:: \ + fast_ln_fwd_kernel \ + <<>>( \ + batch_size, \ + feature_size, \ + epsilon, \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + mean_data, \ + var_data, \ + y_data); \ } break -#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD(ScaleT) \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 768); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1024); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1280); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1536); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1792); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 2048); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 4096) - #ifdef PADDLE_WITH_CUDA bool can_call_fast_kernel = false; if ((feature_size >= 768 && feature_size <= 2048 && feature_size % 256 == 0 || @@ -154,26 +140,20 @@ void LayerNormKernel(const Context &dev_ctx, } if (can_call_fast_kernel) { - if (is_scale_bias_same_dtype_with_x) { - switch (feature_size) { - PADDLE_LAUNCH_FAST_LAYERNORM_FWD(T); - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "Only when feature_size is from 256 to 4096 and is diviaible by " - "256 is supported " - "now")); - break; - } - } else { - switch (feature_size) { - PADDLE_LAUNCH_FAST_LAYERNORM_FWD(U); - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "Only when feature_size is from 256 to 4096 and is diviaible by " - "is supported " - "now")); - break; - } + switch (feature_size) { + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(768); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(1024); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(1280); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(1536); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(1792); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(2048); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(4096); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Only when feature_size is from 256 to 4096 and is diviaible by " + "256 is supported " + "now")); + break; } } else { #endif @@ -198,8 +178,7 @@ void LayerNormKernel(const Context &dev_ctx, #ifdef PADDLE_WITH_CUDA } #endif - -#undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD +#undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE } } // namespace phi
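
Note for reviewers: the WelfordOnline overloads introduced in patch 1 are the standard single-sample Welford update plus the pairwise merge (Chan et al.) used to combine per-lane accumulators. Below is a minimal host-side C++ sketch of the same arithmetic with a tiny self-check; the helper names and the example data are illustrative only, not code from the patch.

#include <cstdio>

// Single-sample Welford update: running mean and M2 (sum of squared deviations).
static void welford_update(float val, float *mean, float *m2, float *count) {
  *count += 1.0f;
  float delta1 = val - *mean;
  *mean += delta1 / *count;
  float delta2 = val - *mean;  // recomputed against the updated mean
  *m2 += delta1 * delta2;
}

// Merge two Welford accumulators, as the kernel does across warp lanes.
static void welford_merge(float b_mean, float b_m2, float b_count,
                          float *mean, float *m2, float *count) {
  if (b_count == 0.0f) return;
  float new_count = *count + b_count;
  float nb_over_n = b_count / new_count;
  float delta = b_mean - *mean;
  *mean += delta * nb_over_n;
  *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
  *count = new_count;
}

int main() {
  const float data[8] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  float m_a = 0.f, m2_a = 0.f, c_a = 0.f;  // accumulator for the first half
  float m_b = 0.f, m2_b = 0.f, c_b = 0.f;  // accumulator for the second half
  for (int i = 0; i < 4; ++i) welford_update(data[i], &m_a, &m2_a, &c_a);
  for (int i = 4; i < 8; ++i) welford_update(data[i], &m_b, &m2_b, &c_b);
  welford_merge(m_b, m2_b, c_b, &m_a, &m2_a, &c_a);
  // Expect mean = 4.5 and biased variance = 5.25 for the values 1..8.
  printf("mean=%f variance=%f\n", m_a, m2_a / c_a);
  return 0;
}

In the kernel this merge is driven by __shfl_down_sync in WelfordWarpAllReduce, and the result is broadcast back to all lanes with __shfl_sync so every lane sees the row's mean and M2.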
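
The per-row math in the LayerNormFwdWithWelford kernel added in patch 1 is: row mean, inverse standard deviation with epsilon, then y = (x - mean) * rsqrt(var + eps) * gamma + beta. A plain C++ reference for checking kernel outputs on small shapes might look like the sketch below; it assumes a row-major [rows x cols] layout, and the function name and test data are illustrative.

#include <cmath>
#include <cstdio>
#include <vector>

// Reference LayerNorm forward over a [rows x cols] row-major matrix.
// Per row: y = (x - mean) * rsqrt(var + epsilon) * gamma + beta.
static void layer_norm_ref(const std::vector<float> &x,
                           const std::vector<float> &gamma,
                           const std::vector<float> &beta,
                           std::vector<float> *y,
                           std::vector<float> *mean,
                           std::vector<float> *invvar,
                           int rows, int cols, float epsilon) {
  for (int r = 0; r < rows; ++r) {
    const float *row = &x[r * cols];
    float m = 0.f;
    for (int c = 0; c < cols; ++c) m += row[c];
    m /= cols;
    float var = 0.f;
    for (int c = 0; c < cols; ++c) var += (row[c] - m) * (row[c] - m);
    var /= cols;                                 // biased (population) variance
    float inv = 1.f / std::sqrt(var + epsilon);  // rsqrt(var + epsilon)
    (*mean)[r] = m;
    (*invvar)[r] = inv;
    for (int c = 0; c < cols; ++c)
      (*y)[r * cols + c] = (row[c] - m) * inv * gamma[c] + beta[c];
  }
}

int main() {
  const int rows = 2, cols = 4;
  std::vector<float> x = {1, 2, 3, 4, 2, 2, 2, 2};
  std::vector<float> gamma(cols, 1.f), beta(cols, 0.f);
  std::vector<float> y(rows * cols), mean(rows), invvar(rows);
  layer_norm_ref(x, gamma, beta, &y, &mean, &invvar, rows, cols, 1e-5f);
  for (float v : y) printf("%f ", v);
  printf("\n");
  return 0;
}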
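
Finally, after patch 4 the host dispatch only takes the fused fast path for the feature sizes that PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE is instantiated for (768 through 2048 in steps of 256, plus 4096); the remaining conditions in the surrounding if, elided above, must also hold before can_call_fast_kernel is set. A hedged restatement of just the size gate:

#include <cstdio>

// Size gate for the fused fast LayerNorm kernel: 768..2048 in steps of 256, or 4096.
// These are exactly the sizes the switch over feature_size covers in the patch.
static bool fast_path_size_ok(int feature_size) {
  return (feature_size >= 768 && feature_size <= 2048 &&
          feature_size % 256 == 0) ||
         feature_size == 4096;
}

int main() {
  const int sizes[] = {512, 768, 1000, 1280, 2048, 3072, 4096};
  for (int s : sizes)
    printf("feature_size=%d -> %s\n", s,
           fast_path_size_ok(s) ? "fast" : "generic");
  return 0;
}

One review observation: the error message in the default branch says "from 256 to 4096", which does not match the 768 lower bound actually enforced by this check, and "diviaible" should read "divisible".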