From 960d2625c13bd794b5cb137ac42fe2cd93872e59 Mon Sep 17 00:00:00 2001 From: zhangbopd <1299246947@qq.com> Date: Thu, 9 Feb 2023 13:18:46 +0800 Subject: [PATCH 1/4] first commit. --- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 128 ++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 1350cb2209c31..ba9c0d42fb3e5 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -21,6 +21,134 @@ namespace phi { +template +__device__ void WelfordOnline(float val, float *mean, float *m2, float *count) { + *count += 1; + float delta1 = val - *mean; + *mean += delta1 / (*count); + float delta2 = val - *mean; + *m2 += delta1 * delta2; +} + +template +__device__ void WelfordOnline(float b_mean, + float b_m2, + float b_count, + float *mean, + float *m2, + float *count) { + if (b_count == 0) { + return; + } + float new_count = *count + b_count; + float nb_n = b_count / new_count; + float delta = b_mean - *mean; + *mean += delta * nb_n; + *m2 += b_m2 + delta * delta * (*count) * nb_n; + *count = new_count; +} + +__device__ void WelfordWarpAllReduce(float thread_mean, + float thread_m2, + float thread_count, + float *mean, + float *m2, + float *count) { + *mean = thread_mean; + *m2 = thread_m2; + *count = thread_count; + for (int mask = 1; mask < 32; mask *= 2) { + float b_mean = __shfl_down_sync(0xffffffff, *mean, mask); + float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask); + float b_count = __shfl_down_sync(0xffffffff, *count, mask); + WelfordOnline(b_mean, b_m2, b_count, mean, m2, count); + } + + *mean = __shfl_sync(0xffffffff, *mean, 0, 32); + *m2 = __shfl_sync(0xffffffff, *m2, 0, 32); + *count = __shfl_sync(0xffffffff, *count, 0, 32); +} + +template +__global__ void LayerNormFwdWithWelford(T *input, + T *output, + T *gamma, + T *beta, + float *mean, + float *invvar, + int rows, + int cols, + double epsilon) { + int threadidx_x = threadIdx.x / 32; + int threadidx_y = threadIdx.x % 32; + int row_offset = blockIdx.x * 4 + threadidx_x; + int cols_per_thread = (cols + 31) / 32; + int cols_this_thread = cols_per_thread; + + int last_y = (cols / cols_per_thread); + + if (threadidx_y == last_y) { + cols_this_thread = cols - cols_per_thread * last_y; + } else if (threadidx_y > last_y) { + cols_this_thread = 0; + } + + int lane_id = threadidx_y; + + if (row_offset < rows) { + float buf[32]; + + float thread_mean = 0.f; + float thread_m2 = 0.f; + float thread_count = 0.f; + + float warp_mean; + float warp_m2; + float warp_count; + + T *row_input = input + row_offset * cols; + T *row_output = output + row_offset * cols; + +#pragma unroll + for (int i = 0; i < cols_this_thread; i++) { + buf[i] = static_cast(row_input[lane_id * cols_per_thread + i]); + } + +#pragma unroll + for (int i = 0; i < cols_this_thread; i++) { + WelfordOnline(buf[i], &thread_mean, &thread_m2, &thread_count); + } + + WelfordWarpAllReduce(thread_mean, + thread_m2, + thread_count, + &warp_mean, + &warp_m2, + &warp_count); + + float row_mean = warp_mean; + float row_variance = max(warp_m2 / warp_count, 0.f); + float row_inv_var = rsqrt(row_variance + epsilon); + + if (lane_id == 0) { + mean[row_offset] = row_mean; + invvar[row_offset] = row_inv_var; + } + +#pragma unroll + for (int i = 0; i < cols_this_thread; ++i) { + buf[i] = (buf[i] - row_mean) * row_inv_var; + } + +#pragma unroll + for (int i = 0; i < cols_this_thread; ++i) { + row_output[lane_id * cols_per_thread + i] = + 
static_cast(buf[i]) * gamma[lane_id * cols_per_thread + i] + + beta[lane_id * cols_per_thread + i]; + } + } +} + template void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, const T *input, From dfca3ea56fb545dc0e6c291db45fcc868b97a4d3 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Thu, 9 Feb 2023 15:32:36 +0000 Subject: [PATCH 2/4] change host logic --- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 148 ++------------------ 1 file changed, 9 insertions(+), 139 deletions(-) diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index ba9c0d42fb3e5..4dc238208ca1a 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -21,134 +21,6 @@ namespace phi { -template -__device__ void WelfordOnline(float val, float *mean, float *m2, float *count) { - *count += 1; - float delta1 = val - *mean; - *mean += delta1 / (*count); - float delta2 = val - *mean; - *m2 += delta1 * delta2; -} - -template -__device__ void WelfordOnline(float b_mean, - float b_m2, - float b_count, - float *mean, - float *m2, - float *count) { - if (b_count == 0) { - return; - } - float new_count = *count + b_count; - float nb_n = b_count / new_count; - float delta = b_mean - *mean; - *mean += delta * nb_n; - *m2 += b_m2 + delta * delta * (*count) * nb_n; - *count = new_count; -} - -__device__ void WelfordWarpAllReduce(float thread_mean, - float thread_m2, - float thread_count, - float *mean, - float *m2, - float *count) { - *mean = thread_mean; - *m2 = thread_m2; - *count = thread_count; - for (int mask = 1; mask < 32; mask *= 2) { - float b_mean = __shfl_down_sync(0xffffffff, *mean, mask); - float b_m2 = __shfl_down_sync(0xffffffff, *m2, mask); - float b_count = __shfl_down_sync(0xffffffff, *count, mask); - WelfordOnline(b_mean, b_m2, b_count, mean, m2, count); - } - - *mean = __shfl_sync(0xffffffff, *mean, 0, 32); - *m2 = __shfl_sync(0xffffffff, *m2, 0, 32); - *count = __shfl_sync(0xffffffff, *count, 0, 32); -} - -template -__global__ void LayerNormFwdWithWelford(T *input, - T *output, - T *gamma, - T *beta, - float *mean, - float *invvar, - int rows, - int cols, - double epsilon) { - int threadidx_x = threadIdx.x / 32; - int threadidx_y = threadIdx.x % 32; - int row_offset = blockIdx.x * 4 + threadidx_x; - int cols_per_thread = (cols + 31) / 32; - int cols_this_thread = cols_per_thread; - - int last_y = (cols / cols_per_thread); - - if (threadidx_y == last_y) { - cols_this_thread = cols - cols_per_thread * last_y; - } else if (threadidx_y > last_y) { - cols_this_thread = 0; - } - - int lane_id = threadidx_y; - - if (row_offset < rows) { - float buf[32]; - - float thread_mean = 0.f; - float thread_m2 = 0.f; - float thread_count = 0.f; - - float warp_mean; - float warp_m2; - float warp_count; - - T *row_input = input + row_offset * cols; - T *row_output = output + row_offset * cols; - -#pragma unroll - for (int i = 0; i < cols_this_thread; i++) { - buf[i] = static_cast(row_input[lane_id * cols_per_thread + i]); - } - -#pragma unroll - for (int i = 0; i < cols_this_thread; i++) { - WelfordOnline(buf[i], &thread_mean, &thread_m2, &thread_count); - } - - WelfordWarpAllReduce(thread_mean, - thread_m2, - thread_count, - &warp_mean, - &warp_m2, - &warp_count); - - float row_mean = warp_mean; - float row_variance = max(warp_m2 / warp_count, 0.f); - float row_inv_var = rsqrt(row_variance + epsilon); - - if (lane_id == 0) { - mean[row_offset] = row_mean; - invvar[row_offset] = row_inv_var; - } - -#pragma unroll - for (int i 
= 0; i < cols_this_thread; ++i) { - buf[i] = (buf[i] - row_mean) * row_inv_var; - } - -#pragma unroll - for (int i = 0; i < cols_this_thread; ++i) { - row_output[lane_id * cols_per_thread + i] = - static_cast(buf[i]) * gamma[lane_id * cols_per_thread + i] + - beta[lane_id * cols_per_thread + i]; - } - } -} - template void LayerNormDirectCUDAFunctor::operator()(gpuStream_t stream, const T *input, @@ -203,12 +75,14 @@ void LayerNormKernel(const Context &dev_ctx, auto *mean_data = dev_ctx.template Alloc(mean); auto *var_data = dev_ctx.template Alloc(var); - auto *void_scale_data = (scale == nullptr ? nullptr : scale->data()); - auto *void_bias_data = (bias == nullptr ? nullptr : bias->data()); + bool valid_scale = scale != nullptr; + bool valid_bias = bias != nullptr; + auto *void_scale_data = valid_scale ? scale->data() : nullptr; + auto *void_bias_data = valid_bias ? bias->data() : nullptr; auto x_dtype = x.dtype(); phi::DataType scale_bias_dtype; - if (void_scale_data != nullptr) { + if (valid_scale) { scale_bias_dtype = scale->dtype(); if (void_bias_data != nullptr) { PADDLE_ENFORCE_EQ( @@ -218,11 +92,10 @@ void LayerNormKernel(const Context &dev_ctx, "should have the same data type.")); } } else { - scale_bias_dtype = (void_bias_data != nullptr ? bias->dtype() : x_dtype); + scale_bias_dtype = valid_bias ? bias->dtype() : x_dtype; } - bool is_scale_bias_same_dtype_with_x = x_dtype == scale_bias_dtype; - if (!is_scale_bias_same_dtype_with_x) { + if (x_dtype != scale_bias_dtype) { PADDLE_ENFORCE_EQ(scale_bias_dtype, paddle::experimental::CppTypeToDataType::Type(), phi::errors::InvalidArgument( @@ -329,11 +202,8 @@ void LayerNormKernel(const Context &dev_ctx, } } else { #endif - if (is_scale_bias_same_dtype_with_x) { - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); - } else { - PADDLE_LAUNCH_LAYERNORM_FWD(U, false); - } + + PADDLE_LAUNCH_LAYERNORM_FWD(T, true); #ifdef PADDLE_WITH_CUDA } #endif From 9dd40ce61f6f3ad4d265711a734aee6af4dbda38 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Thu, 9 Feb 2023 15:51:13 +0000 Subject: [PATCH 3/4] fix code bugs --- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 52 +++++++++------------ 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 4dc238208ca1a..328c6ee4020b6 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -95,41 +95,16 @@ void LayerNormKernel(const Context &dev_ctx, scale_bias_dtype = valid_bias ? 
bias->dtype() : x_dtype; } - if (x_dtype != scale_bias_dtype) { - PADDLE_ENFORCE_EQ(scale_bias_dtype, - paddle::experimental::CppTypeToDataType::Type(), - phi::errors::InvalidArgument( - "Unsupported data type of Scale and Bias")); - } + PADDLE_ENFORCE_EQ( + scale_bias_dtype, + x_dtype, + phi::errors::InvalidArgument("Unsupported data type of Scale and Bias")); auto matrix_dim = phi::flatten_to_2d(x_dims, begin_norm_axis); int64_t batch_size = static_cast(matrix_dim[0]); int64_t feature_size = static_cast(matrix_dim[1]); - auto stream = dev_ctx.stream(); -#define PADDLE_LAUNCH_LAYERNORM_FWD(ScaleBiasT, IsScaleBiasSameDTypeWithX) \ - do { \ - switch (paddle::operators::GetDesiredBlockDim(feature_size)) { \ - FIXED_BLOCK_DIM_CASE( \ - paddle::operators:: \ - LayerNormForward \ - <<>>( \ - x_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ - y_data, \ - mean_data, \ - var_data, \ - epsilon, \ - feature_size)); \ - default: \ - PADDLE_THROW(phi::errors::InvalidArgument( \ - "Product from begin_norm_axis to end must be larger than 1")); \ - break; \ - } \ - } while (0) - #define PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, feature_size) \ case (feature_size): { \ constexpr int WARPS_N = feature_size < 1024 ? 1 : (feature_size / 1024); \ @@ -203,12 +178,27 @@ void LayerNormKernel(const Context &dev_ctx, } else { #endif - PADDLE_LAUNCH_LAYERNORM_FWD(T, true); + switch (paddle::operators::GetDesiredBlockDim(feature_size)) { + FIXED_BLOCK_DIM_CASE( + paddle::operators::LayerNormForward + <<>>( + x_data, + static_cast(void_scale_data), + static_cast(void_bias_data), + y_data, + mean_data, + var_data, + epsilon, + feature_size)); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Product from begin_norm_axis to end must be larger than 1")); + break; + } #ifdef PADDLE_WITH_CUDA } #endif -#undef PADDLE_LAUNCH_LAYERNORM_FWD #undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD } From 39faa5e2d280c0906f845820b3c8895af322e1f6 Mon Sep 17 00:00:00 2001 From: JamesLim-sy Date: Thu, 9 Feb 2023 16:24:33 +0000 Subject: [PATCH 4/4] fix code error --- paddle/phi/kernels/gpu/layer_norm_kernel.cu | 97 ++++++++------------- 1 file changed, 38 insertions(+), 59 deletions(-) diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 328c6ee4020b6..b08f860199840 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -105,45 +105,31 @@ void LayerNormKernel(const Context &dev_ctx, int64_t feature_size = static_cast(matrix_dim[1]); auto stream = dev_ctx.stream(); -#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, feature_size) \ - case (feature_size): { \ - constexpr int WARPS_N = feature_size < 1024 ? 1 : (feature_size / 1024); \ - constexpr int WARPS_M = 4 / WARPS_N; \ - const int THREADS_PER_WARP = 32; \ - const int BYTES_PER_LDG = 16; \ - const int VecSize = BYTES_PER_LDG / sizeof(T); \ - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ - const int ROWS_PER_CTA = WARPS_M; \ - const int grid = static_cast( \ - std::ceil(batch_size / static_cast(ROWS_PER_CTA))); \ - paddle::operators::fast_ln_fwd_kernel \ - <<>>( \ - batch_size, \ - feature_size, \ - epsilon, \ - x_data, \ - static_cast(void_scale_data), \ - static_cast(void_bias_data), \ - mean_data, \ - var_data, \ - y_data); \ +#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(feature_size) \ + case (feature_size): { \ + constexpr int WARPS_N = feature_size < 1024 ? 
1 : (feature_size / 1024); \ + constexpr int WARPS_M = 4 / WARPS_N; \ + const int THREADS_PER_WARP = 32; \ + const int BYTES_PER_LDG = 16; \ + const int VecSize = BYTES_PER_LDG / sizeof(T); \ + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ + const int ROWS_PER_CTA = WARPS_M; \ + const int grid = static_cast( \ + std::ceil(batch_size / static_cast(ROWS_PER_CTA))); \ + paddle::operators:: \ + fast_ln_fwd_kernel \ + <<>>( \ + batch_size, \ + feature_size, \ + epsilon, \ + x_data, \ + static_cast(void_scale_data), \ + static_cast(void_bias_data), \ + mean_data, \ + var_data, \ + y_data); \ } break -#define PADDLE_LAUNCH_FAST_LAYERNORM_FWD(ScaleT) \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 768); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1024); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1280); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1536); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 1792); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 2048); \ - PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(ScaleT, 4096) - #ifdef PADDLE_WITH_CUDA bool can_call_fast_kernel = false; if ((feature_size >= 768 && feature_size <= 2048 && feature_size % 256 == 0 || @@ -154,26 +140,20 @@ void LayerNormKernel(const Context &dev_ctx, } if (can_call_fast_kernel) { - if (is_scale_bias_same_dtype_with_x) { - switch (feature_size) { - PADDLE_LAUNCH_FAST_LAYERNORM_FWD(T); - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "Only when feature_size is from 256 to 4096 and is diviaible by " - "256 is supported " - "now")); - break; - } - } else { - switch (feature_size) { - PADDLE_LAUNCH_FAST_LAYERNORM_FWD(U); - default: - PADDLE_THROW(phi::errors::InvalidArgument( - "Only when feature_size is from 256 to 4096 and is diviaible by " - "is supported " - "now")); - break; - } + switch (feature_size) { + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(768); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(1024); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(1280); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(1536); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(1792); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(2048); + PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE(4096); + default: + PADDLE_THROW(phi::errors::InvalidArgument( + "Only when feature_size is from 256 to 4096 and is diviaible by " + "256 is supported " + "now")); + break; } } else { #endif @@ -198,8 +178,7 @@ void LayerNormKernel(const Context &dev_ctx, #ifdef PADDLE_WITH_CUDA } #endif - -#undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD +#undef PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE } } // namespace phi
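
Note for reviewers: the WelfordOnline overloads introduced in patch 1 are the standard single-sample Welford update plus the pairwise merge (Chan et al.) used to combine per-lane accumulators. Below is a minimal host-side C++ sketch of the same arithmetic with a tiny self-check; the helper names and the example data are illustrative only, not code from the patch.

#include <cstdio>

// Single-sample Welford update: running mean and M2 (sum of squared deviations).
static void welford_update(float val, float *mean, float *m2, float *count) {
  *count += 1.0f;
  float delta1 = val - *mean;
  *mean += delta1 / *count;
  float delta2 = val - *mean;  // recomputed against the updated mean
  *m2 += delta1 * delta2;
}

// Merge two Welford accumulators, as the kernel does across warp lanes.
static void welford_merge(float b_mean, float b_m2, float b_count,
                          float *mean, float *m2, float *count) {
  if (b_count == 0.0f) return;
  float new_count = *count + b_count;
  float nb_over_n = b_count / new_count;
  float delta = b_mean - *mean;
  *mean += delta * nb_over_n;
  *m2 += b_m2 + delta * delta * (*count) * nb_over_n;
  *count = new_count;
}

int main() {
  const float data[8] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  float m_a = 0.f, m2_a = 0.f, c_a = 0.f;  // accumulator for the first half
  float m_b = 0.f, m2_b = 0.f, c_b = 0.f;  // accumulator for the second half
  for (int i = 0; i < 4; ++i) welford_update(data[i], &m_a, &m2_a, &c_a);
  for (int i = 4; i < 8; ++i) welford_update(data[i], &m_b, &m2_b, &c_b);
  welford_merge(m_b, m2_b, c_b, &m_a, &m2_a, &c_a);
  // Expect mean = 4.5 and biased variance = 5.25 for the values 1..8.
  printf("mean=%f variance=%f\n", m_a, m2_a / c_a);
  return 0;
}

In the kernel this merge is driven by __shfl_down_sync in WelfordWarpAllReduce, and the result is broadcast back to all lanes with __shfl_sync so every lane sees the row's mean and M2.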
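
The per-row math in the LayerNormFwdWithWelford kernel added in patch 1 is: row mean, inverse standard deviation with epsilon, then y = (x - mean) * rsqrt(var + eps) * gamma + beta. A plain C++ reference for checking kernel outputs on small shapes might look like the sketch below; it assumes a row-major [rows x cols] layout, and the function name and test data are illustrative.

#include <cmath>
#include <cstdio>
#include <vector>

// Reference LayerNorm forward over a [rows x cols] row-major matrix.
// Per row: y = (x - mean) * rsqrt(var + epsilon) * gamma + beta.
static void layer_norm_ref(const std::vector<float> &x,
                           const std::vector<float> &gamma,
                           const std::vector<float> &beta,
                           std::vector<float> *y,
                           std::vector<float> *mean,
                           std::vector<float> *invvar,
                           int rows, int cols, float epsilon) {
  for (int r = 0; r < rows; ++r) {
    const float *row = &x[r * cols];
    float m = 0.f;
    for (int c = 0; c < cols; ++c) m += row[c];
    m /= cols;
    float var = 0.f;
    for (int c = 0; c < cols; ++c) var += (row[c] - m) * (row[c] - m);
    var /= cols;                                 // biased (population) variance
    float inv = 1.f / std::sqrt(var + epsilon);  // rsqrt(var + epsilon)
    (*mean)[r] = m;
    (*invvar)[r] = inv;
    for (int c = 0; c < cols; ++c)
      (*y)[r * cols + c] = (row[c] - m) * inv * gamma[c] + beta[c];
  }
}

int main() {
  const int rows = 2, cols = 4;
  std::vector<float> x = {1, 2, 3, 4, 2, 2, 2, 2};
  std::vector<float> gamma(cols, 1.f), beta(cols, 0.f);
  std::vector<float> y(rows * cols), mean(rows), invvar(rows);
  layer_norm_ref(x, gamma, beta, &y, &mean, &invvar, rows, cols, 1e-5f);
  for (float v : y) printf("%f ", v);
  printf("\n");
  return 0;
}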
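
Finally, after patch 4 the host dispatch only takes the fused fast path for the feature sizes that PADDLE_LAUNCH_FAST_LAYERNORM_FWD_BASE is instantiated for (768 through 2048 in steps of 256, plus 4096); the remaining conditions in the surrounding if, elided above, must also hold before can_call_fast_kernel is set. A hedged restatement of just the size gate:

#include <cstdio>

// Size gate for the fused fast LayerNorm kernel: 768..2048 in steps of 256, or 4096.
// These are exactly the sizes the switch over feature_size covers in the patch.
static bool fast_path_size_ok(int feature_size) {
  return (feature_size >= 768 && feature_size <= 2048 &&
          feature_size % 256 == 0) ||
         feature_size == 4096;
}

int main() {
  const int sizes[] = {512, 768, 1000, 1280, 2048, 3072, 4096};
  for (int s : sizes)
    printf("feature_size=%d -> %s\n", s,
           fast_path_size_ok(s) ? "fast" : "generic");
  return 0;
}

One review observation: the error message in the default branch says "from 256 to 4096", which does not match the 768 lower bound actually enforced by this check, and "diviaible" should read "divisible".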