From af605625a96c2b201e06bc67bc6c992403165a81 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 4 Nov 2024 13:21:10 -0800 Subject: [PATCH 01/21] update skip layer norm --- .../contrib_ops/cpu/skip_layer_norm.cc | 116 +++++++++++------- onnxruntime/contrib_ops/cpu/skip_layer_norm.h | 8 +- 2 files changed, 75 insertions(+), 49 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index c9ee9e2cb760d..116759c3f31f0 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -42,14 +42,15 @@ namespace { template || std::is_same_v, void>> void ComputeJob( const T* input_data, - const T* skip_data, const T* gamma_data, const T* beta_data, const T* bias_data, - IAllocatorUniquePtr& skip_float_uptr, - IAllocatorUniquePtr& gamma_float_uptr, - IAllocatorUniquePtr& beta_float_uptr, - IAllocatorUniquePtr& bias_float_uptr, + const T* skip_data, + const float* gamma_float_ptr, + const float* beta_float_ptr, + const float* bias_float_ptr, + float* skip_float_ptr, + bool should_convert_skip, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, @@ -58,10 +59,11 @@ void ComputeJob( T* output_data, T* skip_input_bias_add_output_data, AllocatorPtr alloc) { - ORT_UNUSED_PARAMETER(skip_float_uptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(gamma_float_uptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(beta_float_uptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(bias_float_uptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(gamma_float_ptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(beta_float_ptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(bias_float_ptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(skip_float_ptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(should_convert_skip); // only used in MLFloat16 overload ORT_UNUSED_PARAMETER(alloc); auto offset = task_idx * hidden_size; @@ -109,14 +111,15 @@ void ComputeJob( void ComputeJob( const MLFloat16* input_data, - const MLFloat16* skip_data, const MLFloat16* gamma_data, const MLFloat16* beta_data, const MLFloat16* bias_data, - IAllocatorUniquePtr& skip_float_uptr, - IAllocatorUniquePtr& gamma_float_uptr, - IAllocatorUniquePtr& beta_float_uptr, - IAllocatorUniquePtr& bias_float_uptr, + const MLFloat16* skip_data, + const float* gamma_float_ptr, + const float* beta_float_ptr, + const float* bias_float_ptr, + float* skip_float_ptr, + bool should_convert_skip, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, @@ -125,6 +128,11 @@ void ComputeJob( MLFloat16* output_data, MLFloat16* skip_input_bias_add_output_data, AllocatorPtr alloc) { + ORT_UNUSED_PARAMETER(skip_data); // only used in double/float overload + ORT_UNUSED_PARAMETER(gamma_data); // only used in double/float overload + ORT_UNUSED_PARAMETER(beta_data); // only used in double/float overload + ORT_UNUSED_PARAMETER(bias_data); // only used in double/float overload + auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; const MLFloat16* p_skip = skip_data + (offset % skip_size); @@ -138,26 +146,18 @@ void ComputeJob( IAllocatorUniquePtr input_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems); - if (!skip_float_uptr) { - skip_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), 
num_elems); - } - - if (bias_data && !bias_float_uptr) { - bias_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(bias_data, bias_float_uptr.get(), num_elems); + if (should_convert_skip) { + MlasConvertHalfToFloatBuffer(p_skip, skip_float_ptr, num_elems); } IAllocatorUniquePtr output_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); float* output_float_ptr = output_float_uptr.get(); const float* input_float_ptr = input_float_uptr.get(); - const float* skip_float_ptr = skip_float_uptr.get(); - const float* bias_float_ptr = bias_float_uptr.get(); for (size_t h = 0; h < num_elems; h++) { float val = input_float_ptr[h] + skip_float_ptr[h]; - if (bias_float_uptr) { + if (bias_float_ptr) { val += bias_float_ptr[h]; } @@ -177,22 +177,10 @@ void ComputeJob( mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); } - if (!gamma_float_uptr) { - gamma_float_uptr = std::move(input_float_uptr); // overwrite input with gamma values, since they have the same size - MlasConvertHalfToFloatBuffer(gamma_data, gamma_float_uptr.get(), num_elems); - } - - if (beta_data && !beta_float_uptr) { - beta_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(beta_data, beta_float_uptr.get(), num_elems); - } - - const float* gamma_float_ptr = gamma_float_uptr.get(); - const float* beta_float_ptr = beta_float_uptr.get(); for (size_t h = 0; h < num_elems; h++) { if (simplified) { output_float_ptr[h] = output_float_ptr[h] / mean_square * gamma_float_ptr[h]; - } else if (nullptr == beta_float_uptr) { + } else if (nullptr == beta_float_ptr) { output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h]; } else { output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h] + beta_float_ptr[h]; @@ -218,7 +206,11 @@ void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, I template SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) - : OpKernel(op_kernel_info), skip_fp32_(nullptr), gamma_fp32_(nullptr), beta_fp32_(nullptr), bias_fp32_(nullptr) { + : OpKernel(op_kernel_info), + prepacked_gamma_fp32_data_(nullptr), + prepacked_beta_fp32_data_(nullptr), + prepacked_bias_fp32_data_(nullptr), + prepacked_skip_fp32_data_(nullptr) { ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); ORT_ENFORCE(epsilon_ >= 0); } @@ -264,11 +256,45 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { AllocatorPtr alloc; ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); + IAllocatorUniquePtr gamma_fp32; + IAllocatorUniquePtr beta_fp32; + IAllocatorUniquePtr bias_fp32; + IAllocatorUniquePtr skip_fp32; + bool should_convert_skip = false; + if constexpr (std::is_same_v) { + const size_t num_elems = static_cast(hidden_size); + + if (prepacked_gamma_fp32_data_ == nullptr) { + gamma_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + MlasConvertHalfToFloatBuffer(gamma_data, gamma_fp32.get(), num_elems); + } + + if (prepacked_beta_fp32_data_ == nullptr && beta_data) { + beta_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + MlasConvertHalfToFloatBuffer(beta_data, beta_fp32.get(), num_elems); + } + + if (prepacked_bias_fp32_data_ == nullptr && bias_data) { + bias_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + MlasConvertHalfToFloatBuffer(bias_data, bias_fp32.get(), num_elems); + } + + if (prepacked_skip_fp32_data_ == nullptr) { + skip_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + should_convert_skip = true; + // skip data 
will be converted inside ComputeJob, because it needs to use the offset. + } + } + concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { - ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, skip_fp32_, gamma_fp32_, beta_fp32_, - bias_fp32_, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, + ComputeJob(input_data, gamma_data, beta_data, bias_data, skip_data, + prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(), + prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(), + prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), + prepacked_skip_fp32_data_ ? prepacked_skip_fp32_data_.get() : skip_fp32.get(), + should_convert_skip, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data, alloc); }, 0); @@ -284,13 +310,13 @@ Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx is_packed = false; if (input_idx == 1) { // skip - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, skip_fp32_, is_packed); + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_skip_fp32_data_, is_packed); } else if (input_idx == 2) { // gamma - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, gamma_fp32_, is_packed); + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed); } else if (input_idx == 3) { // beta - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, beta_fp32_, is_packed); + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_beta_fp32_data_, is_packed); } else if (input_idx == 4) { // bias - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, bias_fp32_, is_packed); + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h index d904c14857437..83deaf0be8098 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h @@ -21,10 +21,10 @@ class SkipLayerNorm final : public OpKernel { private: float epsilon_; - mutable IAllocatorUniquePtr skip_fp32_; - mutable IAllocatorUniquePtr gamma_fp32_; - mutable IAllocatorUniquePtr beta_fp32_; - mutable IAllocatorUniquePtr bias_fp32_; + IAllocatorUniquePtr prepacked_gamma_fp32_data_; + IAllocatorUniquePtr prepacked_beta_fp32_data_; + IAllocatorUniquePtr prepacked_bias_fp32_data_; + IAllocatorUniquePtr prepacked_skip_fp32_data_; }; } // namespace contrib From 00b7d8c102a566e93cd60a6864010d32602cb445 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 4 Nov 2024 14:04:01 -0800 Subject: [PATCH 02/21] lint --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 116759c3f31f0..4b9b2ba1fdb56 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -59,11 +59,11 @@ void ComputeJob( T* output_data, T* skip_input_bias_add_output_data, AllocatorPtr alloc) { - ORT_UNUSED_PARAMETER(gamma_float_ptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(beta_float_ptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(bias_float_ptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(skip_float_ptr); // only used in MLFloat16 overload - 
ORT_UNUSED_PARAMETER(should_convert_skip); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(gamma_float_ptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(beta_float_ptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(bias_float_ptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(skip_float_ptr); // only used in MLFloat16 overload + ORT_UNUSED_PARAMETER(should_convert_skip); // only used in MLFloat16 overload ORT_UNUSED_PARAMETER(alloc); auto offset = task_idx * hidden_size; @@ -128,7 +128,6 @@ void ComputeJob( MLFloat16* output_data, MLFloat16* skip_input_bias_add_output_data, AllocatorPtr alloc) { - ORT_UNUSED_PARAMETER(skip_data); // only used in double/float overload ORT_UNUSED_PARAMETER(gamma_data); // only used in double/float overload ORT_UNUSED_PARAMETER(beta_data); // only used in double/float overload ORT_UNUSED_PARAMETER(bias_data); // only used in double/float overload From a28daa501a697da64f3ae32d2b4d1fc93e7a2321 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 4 Nov 2024 14:36:44 -0800 Subject: [PATCH 03/21] Don't get data from a prepacked tensor --- .../contrib_ops/cpu/skip_layer_norm.cc | 75 ++++++++----------- onnxruntime/contrib_ops/cpu/skip_layer_norm.h | 2 +- 2 files changed, 31 insertions(+), 46 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 4b9b2ba1fdb56..803a3f85ff6aa 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -42,30 +42,17 @@ namespace { template || std::is_same_v, void>> void ComputeJob( const T* input_data, + const T* skip_data, const T* gamma_data, const T* beta_data, const T* bias_data, - const T* skip_data, - const float* gamma_float_ptr, - const float* beta_float_ptr, - const float* bias_float_ptr, - float* skip_float_ptr, - bool should_convert_skip, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, float epsilon, bool simplified, T* output_data, - T* skip_input_bias_add_output_data, - AllocatorPtr alloc) { - ORT_UNUSED_PARAMETER(gamma_float_ptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(beta_float_ptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(bias_float_ptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(skip_float_ptr); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(should_convert_skip); // only used in MLFloat16 overload - ORT_UNUSED_PARAMETER(alloc); - + T* skip_input_bias_add_output_data) { auto offset = task_idx * hidden_size; const T* p_input = input_data + offset; const T* p_skip = skip_data + (offset % skip_size); @@ -111,9 +98,6 @@ void ComputeJob( void ComputeJob( const MLFloat16* input_data, - const MLFloat16* gamma_data, - const MLFloat16* beta_data, - const MLFloat16* bias_data, const MLFloat16* skip_data, const float* gamma_float_ptr, const float* beta_float_ptr, @@ -128,10 +112,6 @@ void ComputeJob( MLFloat16* output_data, MLFloat16* skip_input_bias_add_output_data, AllocatorPtr alloc) { - ORT_UNUSED_PARAMETER(gamma_data); // only used in double/float overload - ORT_UNUSED_PARAMETER(beta_data); // only used in double/float overload - ORT_UNUSED_PARAMETER(bias_data); // only used in double/float overload - auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; const MLFloat16* p_skip = skip_data + (offset % skip_size); @@ -206,10 +186,10 @@ void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, I template 
SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) : OpKernel(op_kernel_info), + prepacked_skip_fp32_data_(nullptr), prepacked_gamma_fp32_data_(nullptr), prepacked_beta_fp32_data_(nullptr), - prepacked_bias_fp32_data_(nullptr), - prepacked_skip_fp32_data_(nullptr) { + prepacked_bias_fp32_data_(nullptr) { ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); ORT_ENFORCE(epsilon_ >= 0); } @@ -217,10 +197,10 @@ SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) template Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* input = p_ctx->Input(0); - const Tensor* skip = p_ctx->Input(1); - const Tensor* gamma = p_ctx->Input(2); - const Tensor* beta = p_ctx->Input(3); - const Tensor* bias = p_ctx->Input(4); + const Tensor* skip = prepacked_skip_fp32_data_ ? nullptr : p_ctx->Input(1); + const Tensor* gamma = prepacked_gamma_fp32_data_ ? nullptr : p_ctx->Input(2); + const Tensor* beta = prepacked_beta_fp32_data_ ? nullptr : p_ctx->Input(3); + const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input(4); Tensor* output = p_ctx->Output(0, input->Shape()); // For inferencing, we support one more optional output which is the sum of the input and skip tensors Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape()); @@ -240,8 +220,8 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); const T* input_data = input->Data(); - const T* skip_data = skip->Data(); - const T* gamma_data = gamma->Data(); + const T* skip_data = skip == nullptr ? nullptr : skip->Data(); + const T* gamma_data = gamma == nullptr ? nullptr : gamma->Data(); const T* beta_data = beta == nullptr ? nullptr : beta->Data(); const T* bias_data = bias == nullptr ? nullptr : bias->Data(); @@ -255,15 +235,21 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { AllocatorPtr alloc; ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); + IAllocatorUniquePtr skip_fp32; IAllocatorUniquePtr gamma_fp32; IAllocatorUniquePtr beta_fp32; IAllocatorUniquePtr bias_fp32; - IAllocatorUniquePtr skip_fp32; bool should_convert_skip = false; if constexpr (std::is_same_v) { const size_t num_elems = static_cast(hidden_size); - if (prepacked_gamma_fp32_data_ == nullptr) { + if (prepacked_skip_fp32_data_ == nullptr && skip_data) { + skip_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + should_convert_skip = true; + // skip data will be converted inside ComputeJob, because it needs to use the offset. + } + + if (prepacked_gamma_fp32_data_ == nullptr && gamma_data) { gamma_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(gamma_data, gamma_fp32.get(), num_elems); } @@ -277,24 +263,23 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { bias_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(bias_data, bias_fp32.get(), num_elems); } - - if (prepacked_skip_fp32_data_ == nullptr) { - skip_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); - should_convert_skip = true; - // skip data will be converted inside ComputeJob, because it needs to use the offset. - } } concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { - ComputeJob(input_data, gamma_data, beta_data, bias_data, skip_data, - prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(), - prepacked_beta_fp32_data_ ? 
prepacked_beta_fp32_data_.get() : beta_fp32.get(), - prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), - prepacked_skip_fp32_data_ ? prepacked_skip_fp32_data_.get() : skip_fp32.get(), - should_convert_skip, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, - skip_input_bias_add_output_data, alloc); + if constexpr (std::is_same_v) { + ComputeJob(input_data, skip_data, + prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(), + prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(), + prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), + prepacked_skip_fp32_data_ ? prepacked_skip_fp32_data_.get() : skip_fp32.get(), + should_convert_skip, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, + skip_input_bias_add_output_data, alloc); + } else { + ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, + epsilon_, simplified, output_data, skip_input_bias_add_output_data); + } }, 0); diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h index 83deaf0be8098..fcbb00ee93938 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h @@ -21,10 +21,10 @@ class SkipLayerNorm final : public OpKernel { private: float epsilon_; + IAllocatorUniquePtr prepacked_skip_fp32_data_; IAllocatorUniquePtr prepacked_gamma_fp32_data_; IAllocatorUniquePtr prepacked_beta_fp32_data_; IAllocatorUniquePtr prepacked_bias_fp32_data_; - IAllocatorUniquePtr prepacked_skip_fp32_data_; }; } // namespace contrib From 0a6319507c754a3de246877580b0e3daaf92c712 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 5 Nov 2024 02:56:25 -0800 Subject: [PATCH 04/21] remove should_convert_skip --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 803a3f85ff6aa..e350c956b439a 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -103,7 +103,6 @@ void ComputeJob( const float* beta_float_ptr, const float* bias_float_ptr, float* skip_float_ptr, - bool should_convert_skip, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, @@ -114,7 +113,6 @@ void ComputeJob( AllocatorPtr alloc) { auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; - const MLFloat16* p_skip = skip_data + (offset % skip_size); MLFloat16* p_output = output_data + offset; MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? 
nullptr : skip_input_bias_add_output_data + offset; @@ -125,7 +123,8 @@ void ComputeJob( IAllocatorUniquePtr input_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems); - if (should_convert_skip) { + if (skip_data) { + const MLFloat16* p_skip = skip_data + (offset % skip_size); MlasConvertHalfToFloatBuffer(p_skip, skip_float_ptr, num_elems); } @@ -239,14 +238,12 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { IAllocatorUniquePtr gamma_fp32; IAllocatorUniquePtr beta_fp32; IAllocatorUniquePtr bias_fp32; - bool should_convert_skip = false; if constexpr (std::is_same_v) { const size_t num_elems = static_cast(hidden_size); if (prepacked_skip_fp32_data_ == nullptr && skip_data) { skip_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); - should_convert_skip = true; - // skip data will be converted inside ComputeJob, because it needs to use the offset. + // skip data will be converted inside ComputeJob, because it needs to use an offset based on task_idx. } if (prepacked_gamma_fp32_data_ == nullptr && gamma_data) { @@ -274,7 +271,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(), prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), prepacked_skip_fp32_data_ ? prepacked_skip_fp32_data_.get() : skip_fp32.get(), - should_convert_skip, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, + task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data, alloc); } else { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, From b75071bb26bc9ecd6ebd383a29a93f199b641561 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 5 Nov 2024 03:10:12 -0800 Subject: [PATCH 05/21] allocate input and output fp32 outside of ComputeJob --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index e350c956b439a..dbfb79a4747ff 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -102,6 +102,8 @@ void ComputeJob( const float* gamma_float_ptr, const float* beta_float_ptr, const float* bias_float_ptr, + float* input_float_ptr, + float* output_float_ptr, float* skip_float_ptr, ptrdiff_t task_idx, int hidden_size, @@ -120,18 +122,13 @@ void ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast(hidden_size); - IAllocatorUniquePtr input_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems); + MlasConvertHalfToFloatBuffer(p_input, input_float_ptr, num_elems); if (skip_data) { const MLFloat16* p_skip = skip_data + (offset % skip_size); MlasConvertHalfToFloatBuffer(p_skip, skip_float_ptr, num_elems); } - IAllocatorUniquePtr output_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - float* output_float_ptr = output_float_uptr.get(); - - const float* input_float_ptr = input_float_uptr.get(); for (size_t h = 0; h < num_elems; h++) { float val = input_float_ptr[h] + skip_float_ptr[h]; @@ -234,13 +231,19 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { AllocatorPtr alloc; ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); + IAllocatorUniquePtr input_fp32; + 
IAllocatorUniquePtr output_fp32; IAllocatorUniquePtr skip_fp32; IAllocatorUniquePtr gamma_fp32; IAllocatorUniquePtr beta_fp32; IAllocatorUniquePtr bias_fp32; + if constexpr (std::is_same_v) { const size_t num_elems = static_cast(hidden_size); + input_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + output_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + if (prepacked_skip_fp32_data_ == nullptr && skip_data) { skip_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); // skip data will be converted inside ComputeJob, because it needs to use an offset based on task_idx. @@ -270,6 +273,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(), prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(), prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), + input_fp32.get(), output_fp32.get(), prepacked_skip_fp32_data_ ? prepacked_skip_fp32_data_.get() : skip_fp32.get(), task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data, alloc); From d5850a4d3403955e45d068f65ede6db4f94d4507 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 6 Nov 2024 07:31:05 -0800 Subject: [PATCH 06/21] remove unused param --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index dbfb79a4747ff..5bffe865b9c99 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -111,8 +111,7 @@ void ComputeJob( float epsilon, bool simplified, MLFloat16* output_data, - MLFloat16* skip_input_bias_add_output_data, - AllocatorPtr alloc) { + MLFloat16* skip_input_bias_add_output_data) { auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; MLFloat16* p_output = output_data + offset; @@ -276,7 +275,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { input_fp32.get(), output_fp32.get(), prepacked_skip_fp32_data_ ? 
prepacked_skip_fp32_data_.get() : skip_fp32.get(), task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, - skip_input_bias_add_output_data, alloc); + skip_input_bias_add_output_data); } else { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data); From bc2c8207bde69de1de426bba10b8783401ef1d99 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 6 Nov 2024 09:27:11 -0800 Subject: [PATCH 07/21] validate inputs from prepack instead of ctx if needed --- .../contrib_ops/cpu/skip_layer_norm.cc | 26 ++++++++++++------- onnxruntime/contrib_ops/cpu/skip_layer_norm.h | 4 +++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 5bffe865b9c99..ded6992374fe6 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -184,7 +184,11 @@ SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) prepacked_skip_fp32_data_(nullptr), prepacked_gamma_fp32_data_(nullptr), prepacked_beta_fp32_data_(nullptr), - prepacked_bias_fp32_data_(nullptr) { + prepacked_bias_fp32_data_(nullptr), + prepacked_skip_tensor_(nullptr), + prepacked_gamma_tensor_(nullptr), + prepacked_beta_tensor_(nullptr), + prepacked_bias_tensor_(nullptr) { ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); ORT_ENFORCE(epsilon_ >= 0); } @@ -192,10 +196,10 @@ SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) template Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* input = p_ctx->Input(0); - const Tensor* skip = prepacked_skip_fp32_data_ ? nullptr : p_ctx->Input(1); - const Tensor* gamma = prepacked_gamma_fp32_data_ ? nullptr : p_ctx->Input(2); - const Tensor* beta = prepacked_beta_fp32_data_ ? nullptr : p_ctx->Input(3); - const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input(4); + const Tensor* skip = prepacked_skip_fp32_data_ ? prepacked_skip_tensor_ : p_ctx->Input(1); + const Tensor* gamma = prepacked_gamma_fp32_data_ ? prepacked_gamma_tensor_ : p_ctx->Input(2); + const Tensor* beta = prepacked_beta_fp32_data_ ? prepacked_beta_tensor_ : p_ctx->Input(3); + const Tensor* bias = prepacked_bias_fp32_data_ ? prepacked_bias_tensor_ : p_ctx->Input(4); Tensor* output = p_ctx->Output(0, input->Shape()); // For inferencing, we support one more optional output which is the sum of the input and skip tensors Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape()); @@ -215,10 +219,10 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); const T* input_data = input->Data(); - const T* skip_data = skip == nullptr ? nullptr : skip->Data(); - const T* gamma_data = gamma == nullptr ? nullptr : gamma->Data(); - const T* beta_data = beta == nullptr ? nullptr : beta->Data(); - const T* bias_data = bias == nullptr ? nullptr : bias->Data(); + const T* skip_data = prepacked_skip_fp32_data_ ? nullptr : skip->Data(); // skip is mandatory + const T* gamma_data = prepacked_gamma_fp32_data_ ? nullptr : gamma->Data(); // gamma is mandatory + const T* beta_data = (prepacked_beta_fp32_data_ || beta == nullptr) ? nullptr : beta->Data(); + const T* bias_data = (prepacked_bias_fp32_data_ || bias == nullptr) ? 
nullptr : bias->Data(); T* output_data = output->MutableData(); @@ -295,12 +299,16 @@ Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx is_packed = false; if (input_idx == 1) { // skip ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_skip_fp32_data_, is_packed); + prepacked_skip_tensor_ = &tensor; } else if (input_idx == 2) { // gamma ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed); + prepacked_gamma_tensor_ = &tensor; } else if (input_idx == 3) { // beta ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_beta_fp32_data_, is_packed); + prepacked_beta_tensor_ = &tensor; } else if (input_idx == 4) { // bias ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); + prepacked_bias_tensor_ = &tensor; } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h index fcbb00ee93938..fd6480ad239c8 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h @@ -25,6 +25,10 @@ class SkipLayerNorm final : public OpKernel { IAllocatorUniquePtr prepacked_gamma_fp32_data_; IAllocatorUniquePtr prepacked_beta_fp32_data_; IAllocatorUniquePtr prepacked_bias_fp32_data_; + const Tensor* prepacked_skip_tensor_; + const Tensor* prepacked_gamma_tensor_; + const Tensor* prepacked_beta_tensor_; + const Tensor* prepacked_bias_tensor_; }; } // namespace contrib From cd03bedb287eaec82181038676e839c892cc387e Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 6 Nov 2024 09:52:00 -0800 Subject: [PATCH 08/21] lint --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index ded6992374fe6..fb64316d421fe 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -219,8 +219,8 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); const T* input_data = input->Data(); - const T* skip_data = prepacked_skip_fp32_data_ ? nullptr : skip->Data(); // skip is mandatory - const T* gamma_data = prepacked_gamma_fp32_data_ ? nullptr : gamma->Data(); // gamma is mandatory + const T* skip_data = prepacked_skip_fp32_data_ ? nullptr : skip->Data(); // skip is mandatory + const T* gamma_data = prepacked_gamma_fp32_data_ ? nullptr : gamma->Data(); // gamma is mandatory const T* beta_data = (prepacked_beta_fp32_data_ || beta == nullptr) ? nullptr : beta->Data(); const T* bias_data = (prepacked_bias_fp32_data_ || bias == nullptr) ? nullptr : bias->Data(); From 46118bfd1155e2cbe085a15c3dcf2a8b8ff3b9ae Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 7 Nov 2024 13:52:04 -0800 Subject: [PATCH 09/21] Revert "lint" This reverts commit cd03bedb287eaec82181038676e839c892cc387e. 
--- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index fb64316d421fe..ded6992374fe6 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -219,8 +219,8 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); const T* input_data = input->Data(); - const T* skip_data = prepacked_skip_fp32_data_ ? nullptr : skip->Data(); // skip is mandatory - const T* gamma_data = prepacked_gamma_fp32_data_ ? nullptr : gamma->Data(); // gamma is mandatory + const T* skip_data = prepacked_skip_fp32_data_ ? nullptr : skip->Data(); // skip is mandatory + const T* gamma_data = prepacked_gamma_fp32_data_ ? nullptr : gamma->Data(); // gamma is mandatory const T* beta_data = (prepacked_beta_fp32_data_ || beta == nullptr) ? nullptr : beta->Data(); const T* bias_data = (prepacked_bias_fp32_data_ || bias == nullptr) ? nullptr : bias->Data(); From 6ee262f1323ee9db2cad250d2e1d364c55c65192 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 7 Nov 2024 13:52:26 -0800 Subject: [PATCH 10/21] Revert "validate inputs from prepack instead of ctx if needed" This reverts commit bc2c8207bde69de1de426bba10b8783401ef1d99. --- .../contrib_ops/cpu/skip_layer_norm.cc | 26 +++++++------------ onnxruntime/contrib_ops/cpu/skip_layer_norm.h | 4 --- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index ded6992374fe6..5bffe865b9c99 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -184,11 +184,7 @@ SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) prepacked_skip_fp32_data_(nullptr), prepacked_gamma_fp32_data_(nullptr), prepacked_beta_fp32_data_(nullptr), - prepacked_bias_fp32_data_(nullptr), - prepacked_skip_tensor_(nullptr), - prepacked_gamma_tensor_(nullptr), - prepacked_beta_tensor_(nullptr), - prepacked_bias_tensor_(nullptr) { + prepacked_bias_fp32_data_(nullptr) { ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); ORT_ENFORCE(epsilon_ >= 0); } @@ -196,10 +192,10 @@ SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) template Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* input = p_ctx->Input(0); - const Tensor* skip = prepacked_skip_fp32_data_ ? prepacked_skip_tensor_ : p_ctx->Input(1); - const Tensor* gamma = prepacked_gamma_fp32_data_ ? prepacked_gamma_tensor_ : p_ctx->Input(2); - const Tensor* beta = prepacked_beta_fp32_data_ ? prepacked_beta_tensor_ : p_ctx->Input(3); - const Tensor* bias = prepacked_bias_fp32_data_ ? prepacked_bias_tensor_ : p_ctx->Input(4); + const Tensor* skip = prepacked_skip_fp32_data_ ? nullptr : p_ctx->Input(1); + const Tensor* gamma = prepacked_gamma_fp32_data_ ? nullptr : p_ctx->Input(2); + const Tensor* beta = prepacked_beta_fp32_data_ ? nullptr : p_ctx->Input(3); + const Tensor* bias = prepacked_bias_fp32_data_ ? 
nullptr : p_ctx->Input(4); Tensor* output = p_ctx->Output(0, input->Shape()); // For inferencing, we support one more optional output which is the sum of the input and skip tensors Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape()); @@ -219,10 +215,10 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); const T* input_data = input->Data(); - const T* skip_data = prepacked_skip_fp32_data_ ? nullptr : skip->Data(); // skip is mandatory - const T* gamma_data = prepacked_gamma_fp32_data_ ? nullptr : gamma->Data(); // gamma is mandatory - const T* beta_data = (prepacked_beta_fp32_data_ || beta == nullptr) ? nullptr : beta->Data(); - const T* bias_data = (prepacked_bias_fp32_data_ || bias == nullptr) ? nullptr : bias->Data(); + const T* skip_data = skip == nullptr ? nullptr : skip->Data(); + const T* gamma_data = gamma == nullptr ? nullptr : gamma->Data(); + const T* beta_data = beta == nullptr ? nullptr : beta->Data(); + const T* bias_data = bias == nullptr ? nullptr : bias->Data(); T* output_data = output->MutableData(); @@ -299,16 +295,12 @@ Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx is_packed = false; if (input_idx == 1) { // skip ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_skip_fp32_data_, is_packed); - prepacked_skip_tensor_ = &tensor; } else if (input_idx == 2) { // gamma ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed); - prepacked_gamma_tensor_ = &tensor; } else if (input_idx == 3) { // beta ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_beta_fp32_data_, is_packed); - prepacked_beta_tensor_ = &tensor; } else if (input_idx == 4) { // bias ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); - prepacked_bias_tensor_ = &tensor; } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h index fd6480ad239c8..fcbb00ee93938 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h @@ -25,10 +25,6 @@ class SkipLayerNorm final : public OpKernel { IAllocatorUniquePtr prepacked_gamma_fp32_data_; IAllocatorUniquePtr prepacked_beta_fp32_data_; IAllocatorUniquePtr prepacked_bias_fp32_data_; - const Tensor* prepacked_skip_tensor_; - const Tensor* prepacked_gamma_tensor_; - const Tensor* prepacked_beta_tensor_; - const Tensor* prepacked_bias_tensor_; }; } // namespace contrib From b6645d549a04fdc52c842045d7b1da161b4e2097 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 7 Nov 2024 15:01:59 -0800 Subject: [PATCH 11/21] update input validations to account for prepacked tensors --- .../contrib_ops/cpu/skip_layer_norm.cc | 16 ++- .../contrib_ops/cpu/skip_layer_norm_helper.h | 121 ++++++++++++++++-- 2 files changed, 118 insertions(+), 19 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 5bffe865b9c99..69ef248a5e226 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -204,13 +204,15 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { size_t input_dims_size = input_dims.size(); int hidden_size = static_cast(input_dims[input_dims_size - 1]); - ORT_RETURN_IF_ERROR(onnxruntime::contrib::skip_layer_norm_helper::CheckInputs(input, - skip, - gamma, - beta, - bias, - hidden_size, - input_dims_size)); + 
ORT_RETURN_IF_ERROR(skip_layer_norm_helper::CheckPotentiallyPrepackedInputs(input, + skip, + gamma, + beta, + bias, + hidden_size, + input_dims_size, + bool(prepacked_skip_fp32_data_), + bool(prepacked_gamma_fp32_data_))); int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h index 6271f822287e6..c7c5c371df1db 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h @@ -11,14 +11,10 @@ namespace onnxruntime { namespace contrib { namespace skip_layer_norm_helper { +namespace { + template -Status CheckInputs(const T* input, - const T* skip, - const T* gamma, - const T* beta, - const T* bias, - int hidden_size_check, - size_t input_dims_size_check) { +Status CheckSkip(const T* input, const T* skip, int hidden_size_check, size_t input_dims_size_check) { const auto& input_dims_check = input->Shape().GetDims(); const auto& skip_dims_check = skip->Shape().GetDims(); size_t skip_dims_size_check = skip_dims_check.size(); @@ -33,49 +29,150 @@ Status CheckInputs(const T* input, "skip is expected to have same shape as input or, a batch size of 1 or no batch size when input has 3 dimensions"); } - if (input_dims_size_check != 3 && input_dims_size_check != 2) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "input is expected to have 3 or 2 dimensions, got ", input_dims_size_check); - } - if (skip_dims_check[skip_dims_size_check - 1] != input_dims_check[input_dims_size_check - 1] || skip_dims_check[skip_dims_size_check - 2] != input_dims_check[input_dims_size_check - 2]) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "last two dimensions of skip needs to be same as input"); } + return Status::OK(); +} + +template +Status CheckGamma(const T* gamma, int hidden_size_check) { const auto& gamma_dims = gamma->Shape().GetDims(); + if (gamma_dims.size() != 1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "gamma is expected to have 1 dimension, got ", gamma_dims.size()); } + if (gamma_dims[0] != hidden_size_check) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Last dimension of gamma and input does not match"); } + return Status::OK(); +} + +template +Status CheckBeta(const T* beta, int hidden_size_check) { if (nullptr != beta) { const auto& beta_dims = beta->Shape().GetDims(); + if (beta_dims.size() != 1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "beta is expected to have 1 dimension, got ", beta_dims.size()); } + if (beta_dims[0] != hidden_size_check) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Last dimension of beta and input does not match"); } } + return Status::OK(); +} + +template +Status CheckBias(const T* bias, int hidden_size_check) { if (nullptr != bias) { const auto& bias_dims = bias->Shape().GetDims(); + if (bias_dims.size() != 1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "bias is expected to have 1 dimension, got ", bias_dims.size()); } + if (bias_dims[0] != hidden_size_check) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Last dimension of bias and input does not match"); } } + + return Status::OK(); +} + +} // anonymous namespace + +template +Status CheckInputs(const T* input, + const T* skip, + const T* gamma, + const T* beta, + const T* bias, + int hidden_size_check, + size_t input_dims_size_check) { + if (input_dims_size_check != 3 && input_dims_size_check != 2) { + return 
ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "input is expected to have 3 or 2 dimensions, got ", input_dims_size_check); + } + + auto status = CheckSkip(input, skip, hidden_size_check, input_dims_size_check); + if (status != Status::OK()) { + return status; + } + + status = CheckGamma(gamma, hidden_size_check); + if (status != Status::OK()) { + return status; + } + + status = CheckBeta(beta, hidden_size_check); + if (status != Status::OK()) { + return status; + } + + status = CheckBias(bias, hidden_size_check); + if (status != Status::OK()) { + return status; + } + + return Status::OK(); +} + +template +Status CheckPotentiallyPrepackedInputs(const T* input, + const T* skip, + const T* gamma, + const T* beta, + const T* bias, + int hidden_size_check, + size_t input_dims_size_check + bool prepacked_skip, + bool prepacked_gamma) { + if (input_dims_size_check != 3 && input_dims_size_check != 2) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "input is expected to have 3 or 2 dimensions, got ", input_dims_size_check); + } + + if (nullptr != skip) { + auto status = CheckSkip(input, skip, hidden_size_check, input_dims_size_check); + if (status != Status::OK()) { + return status; + } + } else if (!prepacked_skip) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "skip is expected but not provided"); + } + + if (nullptr != gamma) { + auto status = CheckGamma(gamma, hidden_size_check); + if (status != Status::OK()) { + return status; + } + } else if (!prepacked_gamma) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "gamma is expected but not provided"); + } + + auto status = CheckBeta(beta, hidden_size_check); + if (status != Status::OK()) { + return status; + } + + status = CheckBias(bias, hidden_size_check); + if (status != Status::OK()) { + return status; + } + return Status::OK(); } From bf13b74df5418870ee8dab242f8d7290e4608088 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 7 Nov 2024 15:08:43 -0800 Subject: [PATCH 12/21] small fix --- onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h index c7c5c371df1db..cafdaa29142c7 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h @@ -137,7 +137,7 @@ Status CheckPotentiallyPrepackedInputs(const T* input, const T* beta, const T* bias, int hidden_size_check, - size_t input_dims_size_check + size_t input_dims_size_check, bool prepacked_skip, bool prepacked_gamma) { if (input_dims_size_check != 3 && input_dims_size_check != 2) { From d3ae5ef3516ffe0416083456aea87a381a4e2096 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 7 Nov 2024 15:29:56 -0800 Subject: [PATCH 13/21] remove unused param --- onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h index cafdaa29142c7..4c901f5650dbd 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm_helper.h @@ -14,7 +14,7 @@ namespace skip_layer_norm_helper { namespace { template -Status CheckSkip(const T* input, const T* skip, int hidden_size_check, size_t input_dims_size_check) { +Status CheckSkip(const T* input, const T* skip, size_t input_dims_size_check) { const auto& input_dims_check = 
input->Shape().GetDims(); const auto& skip_dims_check = skip->Shape().GetDims(); size_t skip_dims_size_check = skip_dims_check.size(); @@ -107,7 +107,7 @@ Status CheckInputs(const T* input, "input is expected to have 3 or 2 dimensions, got ", input_dims_size_check); } - auto status = CheckSkip(input, skip, hidden_size_check, input_dims_size_check); + auto status = CheckSkip(input, skip, input_dims_size_check); if (status != Status::OK()) { return status; } @@ -146,7 +146,7 @@ Status CheckPotentiallyPrepackedInputs(const T* input, } if (nullptr != skip) { - auto status = CheckSkip(input, skip, hidden_size_check, input_dims_size_check); + auto status = CheckSkip(input, skip, input_dims_size_check); if (status != Status::OK()) { return status; } From 96e5bfb0a861773375c51081de837e3bbbc9a39a Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 7 Nov 2024 18:10:08 -0800 Subject: [PATCH 14/21] update based on comments --- .../contrib_ops/cpu/skip_layer_norm.cc | 35 +++++++++---------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 69ef248a5e226..1b40765addc8b 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -99,19 +99,19 @@ void ComputeJob( void ComputeJob( const MLFloat16* input_data, const MLFloat16* skip_data, + const IAllocatorUniquePtr& prepacked_skip_fp32_data, const float* gamma_float_ptr, const float* beta_float_ptr, const float* bias_float_ptr, - float* input_float_ptr, float* output_float_ptr, - float* skip_float_ptr, ptrdiff_t task_idx, int hidden_size, int64_t skip_size, float epsilon, bool simplified, MLFloat16* output_data, - MLFloat16* skip_input_bias_add_output_data) { + MLFloat16* skip_input_bias_add_output_data, + AllocatorPtr alloc) { auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; MLFloat16* p_output = output_data + offset; @@ -121,13 +121,18 @@ void ComputeJob( float mean_square(0.0f); const size_t num_elems = static_cast(hidden_size); - MlasConvertHalfToFloatBuffer(p_input, input_float_ptr, num_elems); + IAllocatorUniquePtr input_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); + MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems); - if (skip_data) { + IAllocatorUniquePtr skip_float_uptr = nullptr; + if (prepacked_skip_fp32_data == nullptr && skip_data) { const MLFloat16* p_skip = skip_data + (offset % skip_size); - MlasConvertHalfToFloatBuffer(p_skip, skip_float_ptr, num_elems); + skip_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); + MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), num_elems); } + const float* input_float_ptr = input_float_uptr.get(); + const float* skip_float_ptr = prepacked_skip_fp32_data ? 
prepacked_skip_fp32_data.get() : skip_float_uptr.get(); for (size_t h = 0; h < num_elems; h++) { float val = input_float_ptr[h] + skip_float_ptr[h]; @@ -211,8 +216,8 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { bias, hidden_size, input_dims_size, - bool(prepacked_skip_fp32_data_), - bool(prepacked_gamma_fp32_data_))); + prepacked_skip_fp32_data_ != nullptr, + prepacked_gamma_fp32_data_ != nullptr)); int64_t task_count = input->Shape().SizeToDimension(input_dims_size - 1); @@ -232,9 +237,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { AllocatorPtr alloc; ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); - IAllocatorUniquePtr input_fp32; IAllocatorUniquePtr output_fp32; - IAllocatorUniquePtr skip_fp32; IAllocatorUniquePtr gamma_fp32; IAllocatorUniquePtr beta_fp32; IAllocatorUniquePtr bias_fp32; @@ -242,14 +245,8 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { if constexpr (std::is_same_v) { const size_t num_elems = static_cast(hidden_size); - input_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); output_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); - if (prepacked_skip_fp32_data_ == nullptr && skip_data) { - skip_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); - // skip data will be converted inside ComputeJob, because it needs to use an offset based on task_idx. - } - if (prepacked_gamma_fp32_data_ == nullptr && gamma_data) { gamma_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(gamma_data, gamma_fp32.get(), num_elems); @@ -271,13 +268,13 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { [&](ptrdiff_t task_idx) { if constexpr (std::is_same_v) { ComputeJob(input_data, skip_data, + prepacked_skip_fp32_data_, prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(), prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(), prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), - input_fp32.get(), output_fp32.get(), - prepacked_skip_fp32_data_ ? prepacked_skip_fp32_data_.get() : skip_fp32.get(), + output_fp32.get(), task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, - skip_input_bias_add_output_data); + skip_input_bias_add_output_data, alloc); } else { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data); From fc05afc4b31411c6646eb6e36c0842dd43e952a3 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 7 Nov 2024 18:48:00 -0800 Subject: [PATCH 15/21] pass raw ptr instead of unique ptr --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 1b40765addc8b..5e058697538d7 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -99,7 +99,7 @@ void ComputeJob( void ComputeJob( const MLFloat16* input_data, const MLFloat16* skip_data, - const IAllocatorUniquePtr& prepacked_skip_fp32_data, + const float* prepacked_skip_fp32_data, const float* gamma_float_ptr, const float* beta_float_ptr, const float* bias_float_ptr, @@ -132,7 +132,7 @@ void ComputeJob( } const float* input_float_ptr = input_float_uptr.get(); - const float* skip_float_ptr = prepacked_skip_fp32_data ? 
prepacked_skip_fp32_data.get() : skip_float_uptr.get();
+  const float* skip_float_ptr = prepacked_skip_fp32_data ? prepacked_skip_fp32_data : skip_float_uptr.get();
 
   for (size_t h = 0; h < num_elems; h++) {
     float val = input_float_ptr[h] + skip_float_ptr[h];
@@ -268,7 +268,7 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
       [&](ptrdiff_t task_idx) {
         if constexpr (std::is_same_v<T, MLFloat16>) {
           ComputeJob(input_data, skip_data,
-                     prepacked_skip_fp32_data_,
+                     prepacked_skip_fp32_data_.get(),
                      prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(),
                      prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(),
                      prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(),

From 047b98af80d3b396c9976e33b1cf5629230acbd5 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Fri, 8 Nov 2024 12:13:01 -0800
Subject: [PATCH 16/21] Add skip layer norm unit tests

---
 .../test/contrib_ops/skiplayernorm_op_test.cc | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
index b9ca55073d411..ea2b967bf736e 100644
--- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
@@ -194,6 +194,48 @@ static void RunTest(
   }
 }
 
+TEST(SkipLayerNormTest, SkipLayerNormPrePack) {
+  OpTester test("SkipLayerNormalization", 1, onnxruntime::kMSDomain);
+  test.AddAttribute("epsilon", 1e-05f);
+
+  int batch_size = 1;
+  int sequence_length = 2;
+  int hidden_size = 2;
+  std::vector<int64_t> input_skip_output_dims = {batch_size, sequence_length, hidden_size};
+  std::vector<int64_t> gamma_beta_bias_dims = {hidden_size};
+  test.AddInput<MLFloat16>("x", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
+  test.AddInput<MLFloat16>("skip", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
+  test.AddInput<MLFloat16>("gamma", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
+  test.AddInput<MLFloat16>("beta", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
+  test.AddOutput<MLFloat16>("output", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f,}));
+
+  // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "",
+           {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider,
+            kNnapiExecutionProvider, kQnnExecutionProvider});
+}
+
+TEST(SkipLayerNormTest, SkipSimplifiedLayerNormPrePack) {
+  OpTester test("SkipSimplifiedLayerNormalization", 1, onnxruntime::kMSDomain);
+  test.AddAttribute("epsilon", 1e-05f);
+
+  int batch_size = 1;
+  int sequence_length = 2;
+  int hidden_size = 2;
+  std::vector<int64_t> input_skip_output_dims = {batch_size, sequence_length, hidden_size};
+  std::vector<int64_t> gamma_beta_bias_dims = {hidden_size};
+  test.AddInput<MLFloat16>("x", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
+  test.AddInput<MLFloat16>("skip", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
+  test.AddInput<MLFloat16>("gamma", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
+  test.AddInput<MLFloat16>("bias", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
+  test.AddOutput<MLFloat16>("output", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f,}));
+
+  // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "",
+           {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider,
+            kNnapiExecutionProvider, kQnnExecutionProvider});
+}
+
 TEST(SkipLayerNormTest, SkipLayerNormNullInput) {
   int batch_size = 1;
   int sequence_length = 0;

From ed6c1e14bfd1242f8e5c00289bcf90c03c5c027d Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Fri, 8 Nov 2024 12:23:28 -0800
Subject: [PATCH 17/21] lint

---
 .../test/contrib_ops/skiplayernorm_op_test.cc | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
index ea2b967bf736e..ed468589e3f11 100644
--- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
@@ -207,7 +207,12 @@ TEST(SkipLayerNormTest, SkipLayerNormPrePack) {
   test.AddInput<MLFloat16>("skip", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
   test.AddInput<MLFloat16>("gamma", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
   test.AddInput<MLFloat16>("beta", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
-  test.AddOutput<MLFloat16>("output", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f,}));
+  test.AddOutput<MLFloat16>("output", input_skip_output_dims, ToFloat16({
+                                                                  1.f,
+                                                                  1.f,
+                                                                  1.f,
+                                                                  1.f,
+                                                              }));
 
   // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",
@@ -228,7 +233,12 @@ TEST(SkipLayerNormTest, SkipSimplifiedLayerNormPrePack) {
   test.AddInput<MLFloat16>("x", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
   test.AddInput<MLFloat16>("skip", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
   test.AddInput<MLFloat16>("gamma", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
   test.AddInput<MLFloat16>("bias", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
-  test.AddOutput<MLFloat16>("output", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f,}));
+  test.AddOutput<MLFloat16>("output", input_skip_output_dims, ToFloat16({
+                                                                  1.f,
+                                                                  1.f,
+                                                                  1.f,
+                                                                  1.f,
+                                                              }));
 
   // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",

From 0bdd595670e1189baa0e29fd2c1198e8a3716cd8 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Fri, 8 Nov 2024 14:00:01 -0800
Subject: [PATCH 18/21] Fix pipeline issue

---
 onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
index ed468589e3f11..e04385fb2c838 100644
--- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
@@ -232,7 +232,7 @@ TEST(SkipLayerNormTest, SkipSimplifiedLayerNormPrePack) {
   test.AddInput<MLFloat16>("x", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
   test.AddInput<MLFloat16>("skip", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
   test.AddInput<MLFloat16>("gamma", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
-  test.AddInput<MLFloat16>("bias", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
+  test.AddInput<MLFloat16>("beta", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
   test.AddOutput<MLFloat16>("output", input_skip_output_dims, ToFloat16({
                                                                   1.f,
                                                                   1.f,
                                                                   1.f,
                                                                   1.f,
                                                               }));

From 321a08bc5478e6e95d5d9ff1649f89660efd0df2 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Fri, 8 Nov 2024 19:01:35 -0800
Subject: [PATCH 19/21] update test

---
 .../test/contrib_ops/skiplayernorm_op_test.cc | 20 ++++++++-----------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
index e04385fb2c838..c76597540714a 100644
--- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
@@ -225,20 +225,16 @@ TEST(SkipLayerNormTest, SkipSimplifiedLayerNormPrePack) {
   test.AddAttribute("epsilon", 1e-05f);
 
   int batch_size = 1;
-  int sequence_length = 2;
-  int hidden_size = 2;
+  int sequence_length = 1;
+  int hidden_size = 1;
   std::vector<int64_t> input_skip_output_dims = {batch_size, sequence_length, hidden_size};
   std::vector<int64_t> gamma_beta_bias_dims = {hidden_size};
-  test.AddInput<MLFloat16>("x", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
-  test.AddInput<MLFloat16>("skip", input_skip_output_dims, ToFloat16({1.f, 1.f, 1.f, 1.f}));
-  test.AddInput<MLFloat16>("gamma", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
-  test.AddInput<MLFloat16>("beta", gamma_beta_bias_dims, ToFloat16({1.f, 1.f}), true);
-  test.AddOutput<MLFloat16>("output", input_skip_output_dims, ToFloat16({
-                                                                  1.f,
-                                                                  1.f,
-                                                                  1.f,
-                                                                  1.f,
-                                                              }));
+  std::vector<MLFloat16> one_float16 = ToFloat16({1.f});
+  test.AddInput<MLFloat16>("x", input_skip_output_dims, one_float16);
+  test.AddInput<MLFloat16>("skip", input_skip_output_dims, one_float16);
+  test.AddInput<MLFloat16>("gamma", gamma_beta_bias_dims, one_float16, true);
+  test.AddInput<MLFloat16>("beta", gamma_beta_bias_dims, one_float16, true);
+  test.AddOutput<MLFloat16>("output", input_skip_output_dims, one_float16);
 
   // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
   test.Run(OpTester::ExpectResult::kExpectSuccess, "",

From dcb642385c373429a033fca9d7ac86b5543f4757 Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Mon, 11 Nov 2024 11:23:08 -0800
Subject: [PATCH 20/21] handle prepacked skip size

---
 onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 4 +++-
 onnxruntime/contrib_ops/cpu/skip_layer_norm.h  | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
index 5e058697538d7..12af1ac784209 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc
@@ -186,6 +186,7 @@ void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, I
 template <typename T, bool simplified>
 SkipLayerNorm<T, simplified>::SkipLayerNorm(const OpKernelInfo& op_kernel_info)
     : OpKernel(op_kernel_info),
+      prepacked_skip_fp32_size_(0),
       prepacked_skip_fp32_data_(nullptr),
       prepacked_gamma_fp32_data_(nullptr),
       prepacked_beta_fp32_data_(nullptr),
@@ -232,7 +233,7 @@ Status SkipLayerNorm<T, simplified>::Compute(OpKernelContext* p_ctx) const {
   // For inferencing, we support one more optional output which is the sum of the input and skip tensors
   T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ? nullptr : skip_input_bias_add_output->MutableData<T>();
 
-  const int64_t& skip_size = skip->Shape().Size();
+  const int64_t skip_size = skip ? skip->Shape().Size() : prepacked_skip_fp32_size_;
 
   AllocatorPtr alloc;
   ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc));
@@ -293,6 +294,7 @@ Status SkipLayerNorm<T, simplified>::PrePack(const Tensor& tensor, int input_idx
 
   is_packed = false;
   if (input_idx == 1) {  // skip
+    prepacked_skip_fp32_size_ = tensor.Shape().Size();
     ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_skip_fp32_data_, is_packed);
   } else if (input_idx == 2) {  // gamma
     ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed);
diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
index fcbb00ee93938..e725f648fe275 100644
--- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
+++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.h
@@ -21,6 +21,7 @@ class SkipLayerNorm final : public OpKernel {
 
  private:
   float epsilon_;
+  int64_t prepacked_skip_fp32_size_;
   IAllocatorUniquePtr<float> prepacked_skip_fp32_data_;
   IAllocatorUniquePtr<float> prepacked_gamma_fp32_data_;
   IAllocatorUniquePtr<float> prepacked_beta_fp32_data_;

From 35c8ce67d79aba2f04cf1191864d4237f796ca2f Mon Sep 17 00:00:00 2001
From: Alex Marin
Date: Mon, 11 Nov 2024 13:21:14 -0800
Subject: [PATCH 21/21] fix pipeline issue

---
 .../test/contrib_ops/skiplayernorm_op_test.cc | 22 -------------------
 1 file changed, 22 deletions(-)

diff --git a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
index c76597540714a..4e8d1b9f016f0 100644
--- a/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
+++ b/onnxruntime/test/contrib_ops/skiplayernorm_op_test.cc
@@ -220,28 +220,6 @@ TEST(SkipLayerNormTest, SkipLayerNormPrePack) {
            kNnapiExecutionProvider, kQnnExecutionProvider});
 }
 
-TEST(SkipLayerNormTest, SkipSimplifiedLayerNormPrePack) {
-  OpTester test("SkipSimplifiedLayerNormalization", 1, onnxruntime::kMSDomain);
-  test.AddAttribute("epsilon", 1e-05f);
-
-  int batch_size = 1;
-  int sequence_length = 1;
-  int hidden_size = 1;
-  std::vector<int64_t> input_skip_output_dims = {batch_size, sequence_length, hidden_size};
-  std::vector<int64_t> gamma_beta_bias_dims = {hidden_size};
-  std::vector<MLFloat16> one_float16 = ToFloat16({1.f});
-  test.AddInput<MLFloat16>("x", input_skip_output_dims, one_float16);
-  test.AddInput<MLFloat16>("skip", input_skip_output_dims, one_float16);
-  test.AddInput<MLFloat16>("gamma", gamma_beta_bias_dims, one_float16, true);
-  test.AddInput<MLFloat16>("beta", gamma_beta_bias_dims, one_float16, true);
-  test.AddOutput<MLFloat16>("output", input_skip_output_dims, one_float16);
-
-  // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "",
-           {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider,
-            kNnapiExecutionProvider, kQnnExecutionProvider});
-}
-
 TEST(SkipLayerNormTest, SkipLayerNormNullInput) {
   int batch_size = 1;
   int sequence_length = 0;