cpu: aarch64: Call stateless ACL API from winograd convolution
- Requires ACL v24.08.
- Update doc as necessary.
theComputeKid authored and vpirogov committed Aug 22, 2024
1 parent 2480cb3 commit 03db3e4
Showing 6 changed files with 135 additions and 161 deletions.
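For context: the previous implementation wrapped the stateful `arm_compute::NEWinogradConvolutionLayer`, which binds its tensors and owns internal scratch memory, so each primitive carried a per-instance ACL resource guarded by a mutex. The stateless `arm_compute::experimental::op::CpuWinogradConv2d` operator available from ACL v24.08 is configured from tensor metadata alone and is handed every tensor, including auxiliary buffers, at run time. Below is a minimal sketch of that configure-and-query lifecycle, assuming ACL's experimental operator interface; it is not the literal oneDNN code.

```cpp
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h"

namespace acl = arm_compute;
using Op = acl::experimental::op::CpuWinogradConv2d;

// Configure from metadata only; the operator binds no tensors and
// allocates no memory of its own.
acl::experimental::MemoryRequirements setup_wino(Op &conv,
        const acl::ITensorInfo &src, const acl::ITensorInfo &wei,
        const acl::ITensorInfo *bia, const acl::ITensorInfo &dst,
        const acl::PadStrideInfo &ps, const acl::ActivationLayerInfo &act) {
    conv.configure(&src, &wei, bia, &dst, ps, act,
            /*enable_fast_math=*/true);
    // Each entry describes one auxiliary buffer (slot, size, alignment)
    // that the caller must provide at execution time.
    return conv.workspace();
}
```

Because the operator captures no per-call state, a single instance can serve concurrent executions; that is why the diff below drops the mutex and the resource-mapper plumbing from the winograd primitive.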
README.md: 2 changes (1 addition & 1 deletion)

@@ -173,7 +173,7 @@ On a CPU based on Arm AArch64 architecture, oneDNN CPU engine can be built with
 machine learning applications and provides AArch64 optimized implementations
 of core functions. This functionality currently requires that ACL is downloaded
 and built separately. See [Build from Source] section of the Developer Guide for
-details. oneDNN only supports Compute Library versions 24.07 or later.
+details. oneDNN only supports Compute Library versions 24.08 or later.

 [Arm Compute Library (ACL)]: https://github.com/arm-software/ComputeLibrary

cmake/ACL.cmake: 2 changes (1 addition & 1 deletion)

@@ -31,7 +31,7 @@ endif()

 find_package(ACL REQUIRED)

-set(ACL_MINIMUM_VERSION "24.07")
+set(ACL_MINIMUM_VERSION "24.08")

 if(ACL_FOUND)
     file(GLOB_RECURSE ACL_VERSION_FILE ${ACL_INCLUDE_DIR}/*/arm_compute_version.embed)
src/common/memory_tracking.hpp: 4 changes (4 additions & 0 deletions)

@@ -199,6 +199,8 @@ enum {
     key_conv_gemm_zp_src_comp,
     key_conv_int_dat_in_acc_dt,
     key_conv_padded_bias,
+    key_conv_permuted_inputs,
+    key_conv_permuted_outputs,
     key_conv_permuted_weights,
     key_conv_rtus_space,
     key_conv_store_wsp,
@@ -300,9 +302,11 @@
     key_softmax_interim_store,
     key_sum_reduction,
     key_sum_srcs_cvt,
+    key_wino_transformed_weights,
     key_wino_U,
     key_wino_V,
     key_wino_M,
+    key_wino_workspace,
     // These two keys should always be the last ones,
     // even though they are not in alphabetical order
     key_nested,
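These four new keys give the AArch64 winograd primitive named scratchpad slots for the buffers that the stateless ACL operator no longer allocates itself: permuted inputs and outputs, the transformed (Winograd-domain) weights, and a general workspace. A sketch of the usual book-then-grant pattern follows, with byte counts as placeholders; the real sizes come from the operator's `workspace()` query shown in the changes below.

```cpp
#include "common/memory_tracking.hpp" // oneDNN internal header

namespace mem = dnnl::impl::memory_tracking;

// At primitive-descriptor creation: reserve space under named keys.
// book(key, element_count, element_size) -- raw bytes here.
void book_wino_buffers(mem::registrar_t &scratchpad, size_t workspace_bytes,
        size_t transformed_wei_bytes) {
    scratchpad.book(mem::names::key_wino_workspace, workspace_bytes, 1);
    scratchpad.book(mem::names::key_wino_transformed_weights,
            transformed_wei_bytes, 1);
}

// At execution: the grantor returns the same regions by key.
char *wino_workspace(const mem::grantor_t &grantor) {
    return grantor.get<char>(mem::names::key_wino_workspace);
}
```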
src/cpu/aarch64/acl_convolution_utils.cpp: 51 changes (1 addition & 50 deletions)

@@ -14,7 +14,7 @@
 * limitations under the License.
 *******************************************************************************/

-#include "cpu/aarch64/acl_convolution_utils.hpp"
+#include "acl_convolution_utils.hpp"
 #include "common/convolution_pd.hpp"
 #include "common/utils.hpp"
 #include "oneapi/dnnl/dnnl.h"
@@ -284,55 +284,6 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
     return status::success;
 }

-status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr) {
-
-    // Under these conditions, fallback to faster GEMM-based convolution
-    // unless the user explicitly specifies Winograd algorithm
-    // clang-format off
-    if (one_of(true, src_md.dims[2] > 112, // ih
-            src_md.dims[3] > 112, // iw
-            src_md.dims[1] < 64, // ic
-            dst_md.dims[1] < 64, // oc
-            dnnl_get_max_threads() > 28)
-        && cd.alg_kind == alg_kind::convolution_auto) {
-        return status::unimplemented;
-    }
-    // clang-format on
-
-    // General Compute Library checks, memory tags are also set there
-    acp.alg_winograd = true;
-    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-
-    const bool shape_ok
-            // only unit strides allowed
-            = (acp.padstride_info.stride() == std::pair<uint, uint> {1, 1})
-            // Note: Compute Library supports arbitrary padding for wino kernels
-            // but we only allow small padding to be consistent with oneDNN
-            && (acp.padstride_info.pad().first <= 1) // padding left/right
-            && (acp.padstride_info.pad().second <= 1) // padding top/bottom
-            // only non-dilated convolutions allowed
-            && (acp.dilation_info == arm_compute::Size2D(1, 1));
-
-    ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");
-
-    // clang-format off
-    // Validate convolution manually to check for return status
-    ACL_CHECK_VALID(arm_compute::NEWinogradConvolutionLayer::validate(
-            &acp.src_tensor_info,
-            &acp.wei_tensor_info,
-            acp.with_bias ? &acp.bia_tensor_info : nullptr,
-            &acp.dst_tensor_info,
-            acp.padstride_info,
-            acp.act_info,
-            true)); // enable_fast_math flag in ACL Winograd
-    // clang-format on
-
-    return status::success;
-}
-
 status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &weights_md, memory_desc_t &dst_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
src/cpu/aarch64/acl_winograd_convolution.cpp: 130 changes (115 additions & 15 deletions)

@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2020-2023 Arm Ltd. and affiliates
+* Copyright 2020-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -14,29 +14,129 @@
 * limitations under the License.
 *******************************************************************************/

-#include "cpu/aarch64/acl_winograd_convolution.hpp"
+#include "acl_winograd_convolution.hpp"
+#include "common/memory_tracking.hpp"
+#include "common/utils.hpp"

 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {

+namespace {
+// Keys are anonymous. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+// Map: [slot, key]
+const std::map<int, conv_key_t> wino_conv_keys
+        = {{0, conv_key_t::key_gemm_asm_tmp_buffer},
+                {1, conv_key_t::key_gemm_pretranspose_b},
+                {2, conv_key_t::key_gemm_pretranspose},
+                {3, conv_key_t::key_gemm_interleaved_lhs},
+                {4, conv_key_t::key_gemm_pretransposed_rhs},
+                {5, conv_key_t::key_gemm_transposed_1xwrhs},
+                {6, conv_key_t::key_gemm_tmp_buffer},
+                {7, conv_key_t::key_conv_permuted_outputs},
+                {8, conv_key_t::key_conv_permuted_inputs},
+                {9, conv_key_t::key_wino_workspace},
+                {10, conv_key_t::key_wino_transformed_weights},
+                {11, conv_key_t::key_conv_permuted_weights}};
+} // namespace
+
+status_t acl_wino_convolution_fwd_t::pd_t::init(engine_t *engine) {
+    using namespace data_type;
+    const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
+            && attr()->has_default_values(
+                    primitive_attr_t::skip_mask_t::post_ops, f16);
+    const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
+            && attr()->has_default_values(
+                    primitive_attr_t::skip_mask_t::post_ops, f32);
+    bool ok = is_fwd()
+            && utils::one_of(desc()->alg_kind, alg_kind::convolution_auto,
+                    alg_kind::convolution_winograd)
+            && utils::one_of(true, is_fp16_ok, is_fp32_ok)
+            && !has_zero_dim_memory();
+
+    ok = ok && DNNL_CPU_THREADING_RUNTIME != DNNL_RUNTIME_THREADPOOL;
+    if (!ok) return status::unimplemented;
+
+    CHECK(init_conf());
+
+    set_default_alg_kind(alg_kind::convolution_winograd);
+
+    Op conv;
+    conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info, acp_.act_info,
+            true); // to support 5x5, 7x7 filter shapes in addition to 3x3
+
+    auto scratchpad = scratchpad_registry().registrar();
+    const auto aux_mem = conv.workspace();
+    return init_scratchpad(conv, scratchpad, wino_conv_keys, engine, post_ops,
+            attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum, dst_md_);
+}
+
+status_t acl_wino_convolution_fwd_t::create_resource(
+        engine_t *engine, resource_mapper_t &mapper) const {
+    CHECK(pd()->post_ops.create_resource(engine, mapper));
+    return status::success;
+}
+
+status_t acl_wino_convolution_fwd_t::init(engine_t *engine) {
+    auto acp = pd()->acp_;
+    acl_obj_->conv.configure(&acp.src_tensor_info, &acp.wei_tensor_info,
+            acp.with_bias ? &acp.bia_tensor_info : nullptr,
+            &acp.dst_tensor_info, acp.padstride_info, acp.act_info,
+            true); // to support 5x5, 7x7 filter shapes in addition to 3x3
+
+    acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
+    return status::success;
+}
+
+status_t acl_wino_convolution_fwd_t::pd_t::init_conf() {
+
+    // Under these conditions, fallback to faster GEMM-based convolution
+    // unless the user explicitly specifies Winograd algorithm
+    if (utils::one_of(true, src_md_.dims[2] > 112, // ih
+                src_md_.dims[3] > 112, // iw
+                src_md_.dims[1] < 64, // ic
+                dst_md_.dims[1] < 64, // oc
+                dnnl_get_max_threads() > 28)
+            && desc()->alg_kind == alg_kind::convolution_auto) {
+        return status::unimplemented;
+    }
+
+    // General Compute Library checks, memory tags are also set there
+    acp_.alg_winograd = true;
+    CHECK(acl_convolution_utils::acl_init_conf(
+            acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));
+
+    const bool shape_ok
+            // only unit strides allowed
+            = (acp_.padstride_info.stride() == std::pair<uint, uint> {1, 1})
+            // Note: Compute Library supports arbitrary padding for wino kernels
+            // but we only allow small padding to be consistent with oneDNN
+            && (acp_.padstride_info.pad().first <= 1) // padding left/right
+            && (acp_.padstride_info.pad().second <= 1) // padding top/bottom
+            // only non-dilated convolutions allowed
+            && (acp_.dilation_info == arm_compute::Size2D(1, 1));
+
+    ACL_CHECK_SUPPORT(!shape_ok, "shape not supported by winograd kernels");
+
+    // Validate convolution manually to check for return status
+    ACL_CHECK_VALID(Op::validate(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info, acp_.act_info,
+            true)); // enable_fast_math flag in ACL Winograd
+
+    return status::success;
+}
+
 status_t acl_wino_convolution_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
-    // Lock here is needed because resource_mapper does not support
-    // concurrent multithreaded access.
-    std::lock_guard<std::mutex> _lock {this->mtx};
-    // Retrieve primitive resource and configured Compute Library objects
-    auto *acl_resource
-            = ctx.get_resource_mapper()->get<acl_wino_resource_t>(this);
-    acl_obj_t<arm_compute::NEWinogradConvolutionLayer> &acl_wino_obj
-            = acl_resource->get_acl_obj();
-
-    return execute_forward_conv_acl<
-            acl_obj_t<arm_compute::NEWinogradConvolutionLayer>, pd_t, data_t>(
-            ctx, acl_wino_obj, pd());
+    return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
+            ctx, acl_obj_.get(), pd(), wino_conv_keys);
 }

 } // namespace aarch64
 } // namespace cpu
 } // namespace impl
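The `wino_conv_keys` table at the top of this file pairs each auxiliary-memory slot reported by the operator with a oneDNN scratchpad key, and the shared `init_scratchpad()` helper in `acl_convolution_utils` books one scratchpad entry per slot. A simplified sketch of that booking loop, assuming slots are matched by index into ACL's `MemoryRequirements` (the real helper also wires up post-ops and the sum accumulator, as its argument list above shows):

```cpp
// Simplified sketch; Op and conv_key_t as defined in the diff above.
status_t book_acl_aux_memory(const Op &conv,
        memory_tracking::registrar_t &scratchpad,
        const std::map<int, conv_key_t> &conv_keys) {
    const auto aux_mem = conv.workspace(); // ACL MemoryRequirements
    for (const auto &kv : conv_keys) {
        if (static_cast<size_t>(kv.first) >= aux_mem.size()) break;
        const auto &mem = aux_mem[kv.first];
        if (mem.size == 0) continue; // slot unused for this problem shape
        scratchpad.book(kv.second, mem.size, 1); // byte-sized booking
    }
    return status::success;
}
```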
src/cpu/aarch64/acl_winograd_convolution.hpp: 107 changes (13 additions & 94 deletions)

@@ -19,53 +19,17 @@

#include "cpu/cpu_convolution_pd.hpp"

#include "cpu/aarch64/acl_convolution_utils.hpp"
#include "acl_convolution_utils.hpp"
#include "arm_compute/runtime/experimental/operators/CpuWinogradConv2d.h"

namespace dnnl {
namespace impl {
namespace cpu {
namespace aarch64 {

struct acl_wino_resource_t : public resource_t {
acl_wino_resource_t()
: acl_wino_obj_(utils::make_unique<
acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>()) {}

status_t configure(const acl_conv_conf_t &acp) {
if (!acl_wino_obj_) return status::out_of_memory;

// Init Compute Library tensors based on info from descriptor
acl_wino_obj_->src_tensor.allocator()->init(acp.src_tensor_info);
acl_wino_obj_->wei_tensor.allocator()->init(acp.wei_tensor_info);
acl_wino_obj_->dst_tensor.allocator()->init(acp.dst_tensor_info);
acl_wino_obj_->bia_tensor.allocator()->init(acp.bia_tensor_info);

// clang-format off
acl_wino_obj_->conv.configure(
&acl_wino_obj_->src_tensor,
&acl_wino_obj_->wei_tensor,
acp.with_bias ? &acl_wino_obj_->bia_tensor : nullptr,
&acl_wino_obj_->dst_tensor,
acp.padstride_info,
acp.act_info,
true); // to support 5x5, 7x7 filter shapes in addition to 3x3
// clang-format on

return status::success;
}

acl_obj_t<arm_compute::NEWinogradConvolutionLayer> &get_acl_obj() const {
return *acl_wino_obj_;
}

DNNL_DISALLOW_COPY_AND_ASSIGN(acl_wino_resource_t);

private:
std::unique_ptr<acl_obj_t<arm_compute::NEWinogradConvolutionLayer>>
acl_wino_obj_;
}; // acl_wino_resource_t

struct acl_wino_convolution_fwd_t : public primitive_t {
using Op = arm_compute::experimental::op::CpuWinogradConv2d;

struct pd_t : public cpu_convolution_fwd_pd_t {
pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr,
const typename pd_t::base_class *hint_fwd_pd)
@@ -76,66 +40,22 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
         DECLARE_COMMON_PD_T(
                 "wino:acl", acl_wino_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);

-        status_t init(engine_t *engine) {
-            using namespace data_type;
-            const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
-                    && attr()->has_default_values(
-                            primitive_attr_t::skip_mask_t::post_ops, f16);
-            const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
-                    && attr()->has_default_values(
-                            primitive_attr_t::skip_mask_t::post_ops, f32);
-            bool ok = is_fwd()
-                    && utils::one_of(desc()->alg_kind,
-                            alg_kind::convolution_auto,
-                            alg_kind::convolution_winograd)
-                    && utils::one_of(true, is_fp16_ok, is_fp32_ok)
-                    && !has_zero_dim_memory();
-
-            ok = ok && DNNL_CPU_THREADING_RUNTIME != DNNL_RUNTIME_THREADPOOL;
-            if (!ok) return status::unimplemented;
-
-            CHECK(acl_convolution_utils::init_conf_wino(acp_, src_md_,
-                    weights_md_, dst_md_, bias_md_, *desc(), *attr()));
-
-            set_default_alg_kind(alg_kind::convolution_winograd);
-
-            CHECK(post_ops.init(
-                    engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc_for_sum = post_ops.has_sum();
-
-            if (acp_.use_dst_acc_for_sum) {
-                const memory_desc_wrapper dst_d(&dst_md_);
-                auto scratchpad = scratchpad_registry().registrar();
-                scratchpad.book(memory_tracking::names::key_generic_acc,
-                        dst_d.nelems(), dst_d.data_type_size());
-            }
-
-            return status::success;
-        }
+        status_t init(engine_t *engine);

         acl_conv_conf_t acp_;
         acl_post_ops_t post_ops;
+
+    private:
+        status_t init_conf();
     };

-    acl_wino_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+    acl_wino_convolution_fwd_t(const pd_t *apd)
+        : primitive_t(apd), acl_obj_(std::make_unique<acl_obj_t<Op>>()) {}

     status_t create_resource(
-            engine_t *engine, resource_mapper_t &mapper) const override {
-        if (mapper.has_resource(this)) return status::success;
-
-        auto r = utils::make_unique<acl_wino_resource_t>();
-        if (!r) return status::out_of_memory;
-
-        // Configure the resource based on information from primitive descriptor
-        CHECK(r->configure(pd()->acp_));
-        mapper.add(this, std::move(r));
-
-        CHECK(pd()->post_ops.create_resource(engine, mapper));
-
-        return status::success;
-    }
+            engine_t *engine, resource_mapper_t &mapper) const override;

-    ~acl_wino_convolution_fwd_t() {}
+    status_t init(engine_t *engine) override;

     typedef typename prec_traits<data_type::f32>::type data_t;

Expand All @@ -144,10 +64,9 @@ struct acl_wino_convolution_fwd_t : public primitive_t {
     }

 private:
-    // To guard the const execute_forward(), the mutex must be 'mutable'
-    mutable std::mutex mtx;
     status_t execute_forward(const exec_ctx_t &ctx) const;
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    std::unique_ptr<acl_obj_t<Op>> acl_obj_;
 }; // acl_wino_convolution_fwd_t

 } // namespace aarch64
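With the per-primitive state gone, execution reduces to rebuilding an `arm_compute::ITensorPack` on every call: user tensors are added under their ACL slot ids, and each auxiliary slot is backed by the scratchpad region booked under the matching key. The sketch below illustrates the idea behind `execute_forward_conv_acl`; the slot-id convention and the helper's shape are assumptions, not the literal shared implementation.

```cpp
#include <map>
#include <memory>
#include <vector>

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/Tensor.h"

// Back each requested aux slot with scratchpad memory, then run.
// `Op` and `conv_key_t` are as in the diff above; `grantor` is the
// scratchpad grantor taken from the execution context.
void run_stateless(Op &conv, arm_compute::ITensorPack &pack,
        const arm_compute::experimental::MemoryRequirements &aux_mem,
        const std::map<int, conv_key_t> &conv_keys,
        const dnnl::impl::memory_tracking::grantor_t &grantor) {
    std::vector<std::unique_ptr<arm_compute::Tensor>> aux; // keep alive
    for (const auto &kv : conv_keys) {
        const auto &mem = aux_mem[static_cast<size_t>(kv.first)];
        if (mem.size == 0) continue;
        auto t = std::make_unique<arm_compute::Tensor>();
        t->allocator()->init(arm_compute::TensorInfo(
                arm_compute::TensorShape(mem.size), 1,
                arm_compute::DataType::U8));
        t->allocator()->import_memory(grantor.get<char>(kv.second));
        pack.add_tensor(mem.slot, t.get());
        aux.push_back(std::move(t));
    }
    conv.run(pack); // no mutex: nothing in `conv` mutates per call
}
```

Because the pack and the imported tensors are rebuilt locally on each call, the primitive needs no lock, in contrast to the resource-mapper path removed above.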
