diff --git a/CMakeLists.txt b/CMakeLists.txt
index 321bddd999..cb99dee99e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,7 @@ cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 list(APPEND CMAKE_MESSAGE_CONTEXT ArmCompute)
 project(
   ArmCompute
-  VERSION 40.0.0
+  VERSION 41.0.0
   DESCRIPTION
     "The Arm Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A CPU and Arm® Mali™ GPU architectures"
   LANGUAGES C CXX ASM)
diff --git a/README.md b/README.md
index 71c4642ed8..64474b9890 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@
  <img src="https://raw.githubusercontent.com/ARM-software/ComputeLibrary/gh-pages/ACL_logo.png"/><br><br>
 </div>
 
-# Compute Library ![](https://img.shields.io/badge/latest_release-24.08-green)
+# Compute Library ![](https://img.shields.io/badge/latest_release-24.08.1-green)
 
 
 The Compute Library is a collection of low-level machine learning functions optimized for Arm® Cortex®-A, Arm® Neoverse® and Arm® Mali™ GPUs architectures.<br>
@@ -37,7 +37,7 @@ Key Features:
 <br>
 
 ## Documentation
-[![Documentation](https://img.shields.io/badge/documentation-24.08-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08/index.xhtml)
+[![Documentation](https://img.shields.io/badge/documentation-24.08.1-green)](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/index.xhtml)
 
 > Note: The documentation includes the reference API, changelogs, build guide, contribution guide, errata, etc.
 
@@ -50,22 +50,22 @@ All the binaries can be downloaded from [here](https://github.com/ARM-software/C
 
 | Platform       | Operating System | Release archive (Download) |
 | -------------- | ---------------- | -------------------------- |
-| Raspberry Pi 4 | Linux® 32bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-armv7a-cpu-bin.tar.gz) |
-| Raspberry Pi 4 | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-bin.tar.gz) |
-| Odroid N2      | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-gpu-bin.tar.gz) |
-| HiKey960       | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| Raspberry Pi 4 | Linux® 32bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-bin.tar.gz) |
+| Raspberry Pi 4 | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) |
+| Odroid N2      | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| HiKey960       | Linux® 64bit      | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) |
 
 <br>
 
 | Architecture | Operating System | Release archive (Download) |
 | ------------ | ---------------- | -------------------------- |
-| armv7        | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-armv7a-cpu-gpu-bin.tar.gz) |
-| arm64-v8a    | Android™          | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-android-aarch64-cpu-gpu-bin.tar.gz) |
-| arm64-v8a    | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08/arm_compute-v24.08-linux-aarch64-cpu-gpu-bin.tar.gz) |
+| armv7        | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-armv7a-cpu-gpu-bin.tar.gz) |
+| arm64-v8a    | Android™          | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-android-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-android-aarch64-cpu-gpu-bin.tar.gz) |
+| arm64-v8a    | Linux®            | [![](https://img.shields.io/badge/build-neon-orange)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-bin.tar.gz) [![](https://img.shields.io/badge/build-neon+cl-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/download/v24.08.1/arm_compute-v24.08.1-linux-aarch64-cpu-gpu-bin.tar.gz) |
 
 <br>
 
-Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.08-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.08)
+Please refer to the following link for more pre-built binaries: [![](https://img.shields.io/badge/v24.08.1-bins-yellowgreen)](https://github.com/ARM-software/ComputeLibrary/releases/tag/v24.08.1)
 
 Pre-build binaries are generated with the following security / good coding practices related flags:
 > -Wall, -Wextra, -Wformat=2, -Winit-self, -Wstrict-overflow=2, -Wswitch-default, -Woverloaded-virtual, -Wformat-security, -Wctor-dtor-privacy, -Wsign-promo, -Weffc++, -pedantic, -fstack-protector-strong
@@ -107,13 +107,13 @@ Pre-build binaries are generated with the following security / good coding pract
 
 ## Experimental builds
 
-**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08/how_to_build.xhtml) for more details.
+**⚠ Important** Bazel and CMake builds are experimental CPU only builds, please see the [documentation](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/how_to_build.xhtml) for more details.
 
 <br>
 
 ## How to contribute
 
-Contributions to the Compute Library are more than welcome. If you are interested on contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08/contribution_guidelines.xhtml).
+Contributions to the Compute Library are more than welcome. If you are interested in contributing, please have a look at our [how to contribute guidelines](https://artificial-intelligence.sites.arm.com/computelibrary/v24.08.1/contribution_guidelines.xhtml).
 
 ### Developer Certificate of Origin (DCO)
 Before the Compute Library accepts your contribution, you need to certify its origin and give us your permission. To manage this process we use the Developer Certificate of Origin (DCO) V1.1 (https://developercertificate.org/)
diff --git a/SConscript b/SConscript
index 65e6d4f53a..bd8f034c9c 100644
--- a/SConscript
+++ b/SConscript
@@ -33,8 +33,8 @@ import codecs
 import platform
 import SCons
 
-VERSION = "v24.08"
-LIBRARY_VERSION_MAJOR = 40
+VERSION = "v24.08.1"
+LIBRARY_VERSION_MAJOR = 41
 LIBRARY_VERSION_MINOR = 0
 LIBRARY_VERSION_PATCH = 0
 SONAME_VERSION = str(LIBRARY_VERSION_MAJOR) + "." + str(LIBRARY_VERSION_MINOR) + "." + str(LIBRARY_VERSION_PATCH)
diff --git a/arm_compute/runtime/experimental/operators/CpuActivation.h b/arm_compute/runtime/experimental/operators/CpuActivation.h
index a255ae1b8b..e869f44b1e 100644
--- a/arm_compute/runtime/experimental/operators/CpuActivation.h
+++ b/arm_compute/runtime/experimental/operators/CpuActivation.h
@@ -38,7 +38,7 @@ namespace op
 /** Wrapper class for CpuActivation. For information on the functions,
  * see "src/cpu/operators/CpuActivation.h"
 */
-class CpuActivation : INEOperator
+class CpuActivation : public INEOperator
 {
 public:
     /** Constructor **/
diff --git a/arm_compute/runtime/experimental/operators/CpuAdd.h b/arm_compute/runtime/experimental/operators/CpuAdd.h
index 065cae72ba..45fd230156 100644
--- a/arm_compute/runtime/experimental/operators/CpuAdd.h
+++ b/arm_compute/runtime/experimental/operators/CpuAdd.h
@@ -39,7 +39,7 @@ namespace op
 /** Wrapper class for CpuAdd. For information on the functions,
  * see "src/cpu/operators/CpuAdd.h"
 */
-class CpuAdd : INEOperator
+class CpuAdd : public INEOperator
 {
 public:
     /** Constructor */
diff --git a/arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h b/arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h
index 942abc985c..febb11d08f 100644
--- a/arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h
+++ b/arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h
@@ -41,7 +41,7 @@ namespace op
  * Any new features should be added to arm_compute::cpu::CpuDepthwiseConv2d and
  * arm_compute::experimental::op::CpuDepthwiseConv2d should remain a shallow wrapper.
 */
-class CpuDepthwiseConv2d : IOperator
+class CpuDepthwiseConv2d : public IOperator
 {
 public:
     /** Constructor **/
@@ -55,7 +55,7 @@ class CpuDepthwiseConv2d : IOperator
     /** Default move assignment */
     CpuDepthwiseConv2d &operator=(CpuDepthwiseConv2d &&) = default;
     /** Default destructor */
-    ~CpuDepthwiseConv2d();
+    ~CpuDepthwiseConv2d() override;
 
     /** Initialize the function's source, destination, weights and convolution information.
      *
diff --git a/arm_compute/runtime/experimental/operators/CpuElementwise.h b/arm_compute/runtime/experimental/operators/CpuElementwise.h
index 26d8fb23d3..695a1a20bc 100644
--- a/arm_compute/runtime/experimental/operators/CpuElementwise.h
+++ b/arm_compute/runtime/experimental/operators/CpuElementwise.h
@@ -41,7 +41,7 @@ namespace op
 /** Wrapper class for CpuElementwiseDivision. For information on the functions,
  * see "src/cpu/operators/CpuElementwise.h"
 */
-class CpuElementwiseDivision : INEOperator
+class CpuElementwiseDivision : public INEOperator
 {
 public:
     /** Constructor */
@@ -81,7 +81,7 @@ class CpuElementwiseDivision : INEOperator
 /** Wrapper class for CpuElementwiseMax. For information on the functions,
  * see "src/cpu/operators/CpuElementwise.h"
 */
-class CpuElementwiseMax : INEOperator
+class CpuElementwiseMax : public INEOperator
 {
 public:
     /** Constructor */
@@ -121,7 +121,7 @@ class CpuElementwiseMax : INEOperator
 /** Wrapper class for CpuElementwiseMin. For information on the functions,
  * see "src/cpu/operators/CpuElementwise.h"
 */
-class CpuElementwiseMin : INEOperator
+class CpuElementwiseMin : public INEOperator
 {
 public:
     /** Constructor */
diff --git a/arm_compute/runtime/experimental/operators/CpuGemm.h b/arm_compute/runtime/experimental/operators/CpuGemm.h
index 6072d2e907..ff402bd747 100644
--- a/arm_compute/runtime/experimental/operators/CpuGemm.h
+++ b/arm_compute/runtime/experimental/operators/CpuGemm.h
@@ -45,7 +45,7 @@ namespace op
 /** Wrapper class for CpuGemm. For information on the operators,
  * see "src/cpu/operators/CpuGemm.h"
 */
-class CpuGemm : IOperator
+class CpuGemm : public IOperator
 {
 public:
     /** Constructor **/
diff --git a/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h b/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h
index 89749e3f25..2bbc7148d5 100644
--- a/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h
+++ b/arm_compute/runtime/experimental/operators/CpuGemmConv2d.h
@@ -42,7 +42,7 @@ namespace op
  * Any new features should be added to arm_compute::cpu::CpuGemmConv2d and
  * arm_compute::experimental::op::CpuGemmConv2d should remain a shallow wrapper.
 */
-class CpuGemmConv2d : IOperator
+class CpuGemmConv2d : public IOperator
 {
 public:
     /** Constructor */
@@ -135,7 +135,7 @@ class CpuGemmConv2d : IOperator
                                const WeightsInfo         &weights_info     = WeightsInfo(),
                                const Size2D              &dilation         = Size2D(1U, 1U),
                                const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
-                               const bool                 enable_fast_math = false);
+                               bool                       enable_fast_math = false);
 
     void                             run(ITensorPack &tensors) override;
     void                             prepare(ITensorPack &tensors) override;
diff --git a/arm_compute/runtime/experimental/operators/CpuGemmDirectConv2d.h b/arm_compute/runtime/experimental/operators/CpuGemmDirectConv2d.h
index 9dc37242b0..e4112c3b4a 100644
--- a/arm_compute/runtime/experimental/operators/CpuGemmDirectConv2d.h
+++ b/arm_compute/runtime/experimental/operators/CpuGemmDirectConv2d.h
@@ -41,7 +41,7 @@ namespace op
  * Any new features should be added to arm_compute::cpu::CpuGemmDirectConv2d and
  * arm_compute::experimental::op::CpuGemmDirectConv2d should remain a shallow wrapper.
 */
-class CpuGemmDirectConv2d : IOperator
+class CpuGemmDirectConv2d : public IOperator
 {
 public:
     /** Constructor **/
diff --git a/arm_compute/runtime/experimental/operators/CpuMul.h b/arm_compute/runtime/experimental/operators/CpuMul.h
index be58b77f6c..d5ef33d08b 100644
--- a/arm_compute/runtime/experimental/operators/CpuMul.h
+++ b/arm_compute/runtime/experimental/operators/CpuMul.h
@@ -39,7 +39,7 @@ namespace op
 /** Wrapper class for CpuMul. For information on the functions,
  * see "src/cpu/operators/CpuMul.h"
 */
-class CpuMul : INEOperator
+class CpuMul : public INEOperator
 {
 public:
     /** Constructor */
diff --git a/arm_compute/runtime/experimental/operators/CpuSub.h b/arm_compute/runtime/experimental/operators/CpuSub.h
index 42d2d2ccd3..a68960e31b 100644
--- a/arm_compute/runtime/experimental/operators/CpuSub.h
+++ b/arm_compute/runtime/experimental/operators/CpuSub.h
@@ -41,7 +41,7 @@ namespace op
 /** Wrapper class for CpuSub. For information on the functions,
  * see "src/cpu/operators/CpuSub.h"
 */
-class CpuSub : INEOperator
+class CpuSub : public INEOperator
 {
 public:
     /** Constructor */
diff --git a/arm_compute/runtime/experimental/operators/CpuTranspose.h b/arm_compute/runtime/experimental/operators/CpuTranspose.h
index ec38d39299..d60fe5bdee 100644
--- a/arm_compute/runtime/experimental/operators/CpuTranspose.h
+++ b/arm_compute/runtime/experimental/operators/CpuTranspose.h
@@ -38,7 +38,7 @@ namespace op
 /** Wrapper class for CpuTranspose. For information on the functions,
  * see "src/cpu/operators/CpuTranspose.h"
 */
-class CpuTranspose : INEOperator
+class CpuTranspose : public INEOperator
 {
 public:
     /** Constructor **/
diff --git a/arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h b/arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h
index 534169f0ab..26cbfcdcaa 100644
--- a/arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h
+++ b/arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h
@@ -40,7 +40,7 @@ namespace op
  * Any new features should be added to arm_compute::cpu::CpuWinogradConv2d and
  * arm_compute::experimental::op::CpuWinogradConv2d should remain a shallow wrapper.
 */
-class CpuWinogradConv2d : IOperator
+class CpuWinogradConv2d : public IOperator
 {
 public:
     /** Constructors */
@@ -55,7 +55,7 @@ class CpuWinogradConv2d : IOperator
     CpuWinogradConv2d &operator=(CpuWinogradConv2d &&) = default;
 
     /** Destructor */
-    ~CpuWinogradConv2d();
+    ~CpuWinogradConv2d() override;
 
     /** Set the input and output tensors.
      *
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 0e5dee2382..57f15d0a78 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -60,7 +60,7 @@ PROJECT_NAME           = "Compute Library"
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 24.08
+PROJECT_NUMBER         = 24.08.1
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
index fa034e812c..7c155d1677 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -292,7 +292,7 @@ class GemmHybridIndirect : public GemmCommon<To, Tw, Tr> {
     // Array of pointers to output rows
 //    Tr * const *        _output_ptrs;
 
-    const NDRange<4> _window_range;
+    NDRange<4> _window_range;
 
     unsigned int get_col_sum_size() const {
         if (std::is_same<OutputStage, Requantize32>::value) {
@@ -850,6 +850,18 @@ class GemmHybridIndirect : public GemmCommon<To, Tw, Tr> {
             qp->minval = re.minval;
             qp->maxval = re.maxval;
             _n_block = compute_n_block(_args, _os);
+
+            // Also update the window range because computation of n_block may change wrt B's offset
+            NDRange<4> window_range(iceildiv(_args._Msize, strategy::out_height()), _args._nbatches,
+                              iceildiv(_args._Nsize, _n_block), _args._nmulti);
+
+            // The updated window range should be propagated to kernel execution window
+            // after this method has been called. Otherwise, the window set up at configure time
+            // of the associated kernel will remain.
+            //
+            // See Fallback::update_quantization_parameters() in src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+            // for how this is done.
+            _window_range = window_range;
         }
     }
 };
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
index 44a75f726b..605d58df8f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
@@ -281,7 +281,6 @@ class GemmHybridQuantizedInline : public GemmCommon<To, Tr> {
             qp->per_channel_muls = re.per_channel_muls;
             qp->minval = re.minval;
             qp->maxval = re.maxval;
-            _n_block = compute_n_block(_args, _os);
         }
     }
 };
diff --git a/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp b/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp
index ad759b225e..2a593f1d9d 100644
--- a/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/input_transforms/a64_fp16_6x6.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,14 +30,9 @@ namespace arm_conv {
 namespace winograd {
 namespace input_transform {
 
-void a64_fp16_6x6(
-    const unsigned int n_channels,
-    const __fp16* const input_base,
-    const size_t input_row_stride,
-    const size_t input_col_stride,
-    __fp16* outptr,
-    const size_t matrix_stride
-)
+void a64_fp16_6x6(unsigned int n_channels, const __fp16 * input_base,
+        size_t input_row_stride, size_t input_col_stride,
+        __fp16 * outptr, size_t matrix_stride)
 {
     constexpr int inner_tile_rows = 6;
     constexpr int inner_tile_cols = 6;
diff --git a/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp
index 4218b754b4..3ab0f17907 100644
--- a/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/output_transforms/a64_fp16_4x4_3x3.cpp
@@ -31,17 +31,9 @@ namespace arm_conv {
 namespace winograd {
 namespace output_transform {
 
-void a64_fp16_4x4_3x3(
-    unsigned int n_channels,
-    const __fp16* inptr,
-    const size_t matrix_stride,
-    const __fp16* bptr,
-    __fp16* const output,
-    const size_t output_row_stride,
-    const size_t output_col_stride,
-    const __fp16 output_min,
-    const __fp16 output_max
-)
+void a64_fp16_4x4_3x3(unsigned int n_channels,
+        const __fp16 * inptr, size_t matrix_stride, const __fp16 * bptr, __fp16 *output,
+        size_t output_row_stride, size_t output_col_stride, __fp16 output_min, __fp16 output_max)
 {
     constexpr int output_tile_rows = 4, output_tile_cols = 4;
 
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp
index 0d9a65890e..61de6eb5bb 100644
--- a/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms/a64_fp16_4x4_3x3.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-
 #include <cstddef>
 #include <arm_neon.h>
 
@@ -30,14 +29,9 @@ namespace arm_conv {
 namespace winograd {
 namespace weight_transform {
 
-void a64_fp16_4x4_3x3(
-    unsigned int n_channels,
-    const __fp16* inptr,  // NOTE: Data in HWIO order
-    const size_t ld_weight_row,
-    const size_t ld_weight_col,
-    __fp16* outptr,
-    const size_t matrix_stride
-)
+void a64_fp16_4x4_3x3(unsigned int n_channels, const __fp16 * inptr,
+                      size_t ld_weight_row, size_t ld_weight_col, __fp16 * outptr,
+                      size_t matrix_stride)
 {
 #ifdef __aarch64__
     for (; n_channels >= 8; n_channels -= 8)
diff --git a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
index aebb9497e2..ab85aade3c 100644
--- a/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
+++ b/src/core/NEON/kernels/convolution/winograd/weight_transforms_fp16.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022, 2024 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,7 +31,7 @@ namespace arm_conv {
 namespace winograd {
 namespace weight_transform {
 
-void *a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, size_t);
+void a64_fp16_4x4_3x3(unsigned int, const __fp16 *, size_t, size_t, __fp16 *, size_t);
 
 #define IMPL(KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN) \
   new Transform<__fp16>(#KERN, KERN_ROWS, KERN_COLS, TRANS_ROWS, TRANS_COLS, KERN)
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
index 72fafca1bb..bdbfb54c22 100644
--- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -96,6 +96,15 @@ class CpuGemmAssemblyWrapperKernel final : public INEKernel
         _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
     }
 
+    /** Set the execution window of the kernel by forwarding it to INEKernel::configure()
+     *
+     * @param[in] win Region on which to execute the kernel
+     */
+    void configure_window(const Window &win)
+    {
+        INEKernel::configure(win);
+    }
+
     /** Initialise the kernel's input and output.
      *
      * @param[in] kernel          Pointer to an assembly kernel implementation.
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
index fecee7d765..15feb7c325 100644
--- a/src/cpu/operators/CpuSoftmax.cpp
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -49,7 +49,7 @@ void CpuSoftmaxGeneric::configure(const ITensorInfo *src, ITensorInfo *dst, floa
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
+    ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis, is_log));
     ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis);
 
     const unsigned int actual_axis =
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 881142c374..aef59ffb30 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -247,6 +247,13 @@ class Fallback : public CpuGemmAssemblyDispatch::IFallback
         }
 
         _gemm_kernel_asm->update_quantization_parameters(gemm_requant_info);
+
+        // After update_quantization_parameters(), window may change, reconfigure it.
+        auto *opt = reinterpret_cast<kernel::CpuGemmAssemblyWrapperKernel<TypeInput, TypeWeight, TypeOutput> *>(
+            _optimised_kernel.get());
+        const Window win = to_window(_gemm_kernel_asm->get_window_size());
+        opt->configure_window(win);
+
         _is_prepared = is_prepared;
     }