From 16d6dd4fdcdcf4c6a51b77764238c2eb97dad5f8 Mon Sep 17 00:00:00 2001
From: Hamza Butt
Date: Tue, 27 Aug 2024 14:08:58 +0000
Subject: [PATCH] cpu: aarch64: Enable stateless ACL depthwise convolution

- All common conv code is now stateless, so we can delete all legacy
  code.
- Coincidentally fixes #2033 by making the weights constant
---
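Notes (placed below the "---" so git-am drops them; not part of the commit
message): the #2033 fix comes down to how execute_forward_conv_acl() now
builds its ITensorPack. A minimal sketch of the idea, using only ACL calls
that already appear in this diff; the helper name make_conv_pack is
hypothetical and for illustration only:

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Tensor.h"

    // Registering the weights (ACL_SRC_1) and bias (ACL_SRC_2) via
    // add_const_tensor() tells ACL they will not change between run()
    // calls, so their packed form can be reused instead of being
    // regenerated on every execution.
    arm_compute::ITensorPack make_conv_pack(arm_compute::Tensor &src,
            const arm_compute::Tensor &wei, const arm_compute::Tensor &bia,
            arm_compute::Tensor &dst) {
        arm_compute::ITensorPack pack;
        pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src);
        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &wei);
        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &bia);
        pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst);
        return pack;
    }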
 src/cpu/aarch64/acl_convolution_utils.cpp     | 19 ----
 src/cpu/aarch64/acl_convolution_utils.hpp     | 90 ++++-------------
 src/cpu/aarch64/acl_depthwise_convolution.cpp | 75 ++++++++++++--
 src/cpu/aarch64/acl_depthwise_convolution.hpp | 99 +++----------------
 4 files changed, 93 insertions(+), 190 deletions(-)

diff --git a/src/cpu/aarch64/acl_convolution_utils.cpp b/src/cpu/aarch64/acl_convolution_utils.cpp
index b3ca0f497e7..15437746069 100644
--- a/src/cpu/aarch64/acl_convolution_utils.cpp
+++ b/src/cpu/aarch64/acl_convolution_utils.cpp
@@ -283,25 +283,6 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
     return status::success;
 }
-
-status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr) {
-    if (weights_md.ndims != 5) return status::unimplemented;
-
-    CHECK(acl_init_conf(acp, src_md, weights_md, dst_md, bias_md, cd, attr));
-
-    ACL_CHECK_VALID(arm_compute::NEDepthwiseConvolutionLayer::validate(
-            &acp.src_tensor_info, &acp.wei_tensor_info,
-            acp.with_bias ? &acp.bia_tensor_info : nullptr,
-            &acp.dst_tensor_info, acp.padstride_info,
-            1, // depth multiplier default value
-            acp.act_info, acp.dilation_info));
-
-    return status::success;
-}
-
 } // namespace acl_convolution_utils
 
 } // namespace aarch64
diff --git a/src/cpu/aarch64/acl_convolution_utils.hpp b/src/cpu/aarch64/acl_convolution_utils.hpp
index 60293e0a9e1..37a3d6c3d98 100644
--- a/src/cpu/aarch64/acl_convolution_utils.hpp
+++ b/src/cpu/aarch64/acl_convolution_utils.hpp
@@ -17,23 +17,20 @@
 #ifndef CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP
 #define CPU_AARCH64_ACL_CONVOLUTION_UTILS_HPP
 
+#include <map>
+#include "acl_post_ops.hpp"
+#include "acl_utils.hpp"
+#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
 #include "cpu/cpu_convolution_pd.hpp"
-
-#include "cpu/aarch64/acl_post_ops.hpp"
-#include "cpu/aarch64/acl_utils.hpp"
-
+#include <memory>
 namespace dnnl {
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
-template <typename NEConv>
+template <typename ConvOp>
 struct acl_obj_t {
-    NEConv conv;
-    arm_compute::Tensor src_tensor;
-    arm_compute::Tensor wei_tensor;
-    arm_compute::Tensor bia_tensor;
-    arm_compute::Tensor dst_tensor;
+    ConvOp conv;
     arm_compute::experimental::MemoryRequirements aux_mem_req;
 };
 
@@ -51,6 +48,7 @@ struct acl_conv_conf_t {
     arm_compute::TensorInfo wei_tensor_info;
     arm_compute::TensorInfo bia_tensor_info;
     arm_compute::TensorInfo dst_tensor_info;
+
     arm_compute::PadStrideInfo padstride_info;
     arm_compute::Size2D dilation_info;
     // Additional information about the weights not included in wei_tensor_info
@@ -66,15 +64,6 @@ status_t acl_init_conf(acl_conv_conf_t &acp, memory_desc_t &src_md,
         memory_desc_t &bias_md, const convolution_desc_t &cd,
         const primitive_attr_t &attr);
 
-status_t init_conf_depthwise(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr);
-
-status_t init_conf_wino(acl_conv_conf_t &acp, memory_desc_t &src_md,
-        memory_desc_t &weights_md, memory_desc_t &dst_md,
-        memory_desc_t &bias_md, const convolution_desc_t &cd,
-        const primitive_attr_t &attr);
 } // namespace acl_convolution_utils
 
 // Keys are anonymous with local linkage. So deduce the type automagically.
@@ -127,7 +116,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
     arm_compute::Tensor dst_tensor;
 
     auto const acp = pd->acp_;
-
     src_tensor.allocator()->init(acp.src_tensor_info);
     wei_tensor.allocator()->init(acp.wei_tensor_info);
     dst_tensor.allocator()->init(acp.dst_tensor_info);
@@ -151,11 +139,15 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
                 const_cast<bia_data_t *>(bia_base));
     }
 
-    arm_compute::ITensorPack pack
-            = {{arm_compute::TensorType::ACL_SRC_0, &src_tensor},
-                    {arm_compute::TensorType::ACL_SRC_1, &wei_tensor},
-                    {arm_compute::TensorType::ACL_SRC_2, &bia_tensor},
-                    {arm_compute::TensorType::ACL_DST, &dst_tensor}};
+    // Constness of the weight tensor matters for depthwise conv in ACL:
+    // otherwise ACL re-packs the weights more often than needed, since
+    // it assumes the weights may change between calls to the run
+    // function.
+    arm_compute::ITensorPack pack;
+    pack.add_tensor(arm_compute::TensorType::ACL_SRC_0, &src_tensor);
+    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, &wei_tensor);
+    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_2, &bia_tensor);
+    pack.add_tensor(arm_compute::TensorType::ACL_DST, &dst_tensor);
 
     // Get temp workspaces.
     const auto aux_mem = acl_conv_obj->aux_mem_req;
@@ -175,7 +167,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
         }
     }
 
-    acl_conv_obj->conv.prepare(pack);
     acl_conv_obj->conv.run(pack);
 
     void *dst = dst_tensor.buffer();
@@ -184,53 +175,6 @@ status_t execute_forward_conv_acl(const exec_ctx_t &ctx,
     return status::success;
 }
 
-template <typename conv_obj_t, typename conv_pd_t, typename src_data_t,
-        typename wei_data_t = src_data_t, typename dst_data_t = src_data_t,
-        typename bia_data_t = src_data_t>
-status_t execute_forward_conv_acl(
-        const exec_ctx_t &ctx, conv_obj_t &acl_conv_obj, const conv_pd_t *pd) {
-    bool with_bias = pd->acp_.with_bias;
-    bool use_dst_acc_for_sum = pd->acp_.use_dst_acc_for_sum;
-
-    auto src_base = CTX_IN_MEM(const src_data_t *, DNNL_ARG_SRC);
-    auto wei_base = CTX_IN_MEM(const wei_data_t *, DNNL_ARG_WEIGHTS);
-
-    // import_memory() and free() methods do not allocate/free any additional
-    // memory, only acquire/release pointers.
-    acl_conv_obj.src_tensor.allocator()->import_memory(
-            const_cast<src_data_t *>(src_base));
-    acl_conv_obj.wei_tensor.allocator()->import_memory(
-            const_cast<wei_data_t *>(wei_base));
-
-    const auto scratchpad = ctx.get_scratchpad_grantor();
-
-    // If we have an unfused sum post op, put the result in a scratchpad tensor.
-    // Result will be summed to the dst during acl_post_ops.execute
-    auto dst_base = use_dst_acc_for_sum
-            ? scratchpad.get<void>(memory_tracking::names::key_generic_acc)
-            : CTX_OUT_MEM(dst_data_t *, DNNL_ARG_DST);
-    acl_conv_obj.dst_tensor.allocator()->import_memory(dst_base);
-
-    if (with_bias) {
-        auto bia_base = CTX_IN_MEM(const bia_data_t *, DNNL_ARG_BIAS);
-        acl_conv_obj.bia_tensor.allocator()->import_memory(
-                const_cast<bia_data_t *>(bia_base));
-    }
-
-    acl_conv_obj.conv.run();
-
-    acl_conv_obj.src_tensor.allocator()->free();
-    acl_conv_obj.wei_tensor.allocator()->free();
-    if (with_bias) { acl_conv_obj.bia_tensor.allocator()->free(); }
-
-    void *dst = acl_conv_obj.dst_tensor.buffer();
-    pd->post_ops.execute(ctx, dst);
-
-    acl_conv_obj.dst_tensor.allocator()->free();
-
-    return status::success;
-}
-
 } // namespace aarch64
 } // namespace cpu
 } // namespace impl
diff --git a/src/cpu/aarch64/acl_depthwise_convolution.cpp b/src/cpu/aarch64/acl_depthwise_convolution.cpp
index 70ae6bceeab..0bf397edc49 100644
--- a/src/cpu/aarch64/acl_depthwise_convolution.cpp
+++ b/src/cpu/aarch64/acl_depthwise_convolution.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2023 Arm Ltd. and affiliates
+* Copyright 2023-2024 Arm Ltd. and affiliates
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -21,21 +21,76 @@
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
+namespace {
+// Keys are anonymous. So deduce the type automagically.
+using conv_key_t = decltype(memory_tracking::names::key_gemm_tmp_buffer);
+
+// Map: [slot, key]
+const std::map<int, conv_key_t> depthwise_conv_keys
+        = {{0, conv_key_t::key_gemm_tmp_buffer},
+                {1, conv_key_t::key_conv_permuted_weights}};
+} // namespace
+
 status_t acl_depthwise_convolution_fwd_t::execute_forward(
         const exec_ctx_t &ctx) const {
-    std::lock_guard<std::mutex> _lock {this->mtx};
+    return execute_forward_conv_acl<acl_obj_t<Op>, pd_t, data_t>(
+            ctx, acl_obj_.get(), pd(), depthwise_conv_keys);
+}
+
+status_t acl_depthwise_convolution_fwd_t::pd_t::init(engine_t *engine) {
+    using namespace data_type;
+
+    const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
+            && attr()->has_default_values(
+                    primitive_attr_t::skip_mask_t::post_ops, f16);
+    const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
+            && attr()->has_default_values(
+                    primitive_attr_t::skip_mask_t::post_ops, f32);
+    bool ok = is_fwd() && set_default_alg_kind(alg_kind::convolution_direct)
+            && utils::one_of(true, is_fp16_ok, is_fp32_ok)
+            && !has_zero_dim_memory();
+    if (!ok) return status::unimplemented;
+
+    if (weights_md_.ndims != 5) return status::unimplemented;
 
-    auto *acl_resource
-            = ctx.get_resource_mapper()
-                      ->get<acl_depthwise_convolution_resource_t>(this);
-    acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer> &acl_depthwise_obj
-            = acl_resource->get_acl_obj();
+    CHECK(acl_convolution_utils::acl_init_conf(
+            acp_, src_md_, weights_md_, dst_md_, bias_md_, *desc(), *attr()));
 
-    return execute_forward_conv_acl<
-            acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>, pd_t, data_t>(
-            ctx, acl_depthwise_obj, pd());
+    ACL_CHECK_VALID(Op::validate(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info,
+            1, // depth multiplier default value
+            acp_.act_info, acp_.dilation_info));
+
+    Op conv;
+    conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info,
+            1, // depth multiplier default value
+            acp_.act_info, acp_.dilation_info);
+
+    auto scratchpad = scratchpad_registry().registrar();
+    return init_scratchpad(conv, scratchpad, depthwise_conv_keys, engine,
+            post_ops, attr_.post_ops_, acp_.act_info, acp_.use_dst_acc_for_sum,
+            dst_md_);
 }
 
+status_t acl_depthwise_convolution_fwd_t::create_resource(
+        engine_t *engine, resource_mapper_t &mapper) const {
+    CHECK(pd()->post_ops.create_resource(engine, mapper));
+    return status::success;
+}
+
+status_t acl_depthwise_convolution_fwd_t::init(engine_t *engine) {
+    auto acp_ = pd()->acp_;
+    acl_obj_->conv.configure(&acp_.src_tensor_info, &acp_.wei_tensor_info,
+            acp_.with_bias ? &acp_.bia_tensor_info : nullptr,
+            &acp_.dst_tensor_info, acp_.padstride_info,
+            1, // depth multiplier default value
+            acp_.act_info, acp_.dilation_info);
+    acl_obj_->aux_mem_req = acl_obj_->conv.workspace();
+    return status::success;
+}
 } // namespace aarch64
 } // namespace cpu
 } // namespace impl
diff --git a/src/cpu/aarch64/acl_depthwise_convolution.hpp b/src/cpu/aarch64/acl_depthwise_convolution.hpp
index 4df41a4ef1a..c8503af15ff 100644
--- a/src/cpu/aarch64/acl_depthwise_convolution.hpp
+++ b/src/cpu/aarch64/acl_depthwise_convolution.hpp
@@ -17,7 +17,8 @@
 #ifndef CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
 #define CPU_AARCH64_ACL_DEPTHWISE_CONVOLUTION_HPP
 
-#include "cpu/aarch64/acl_convolution_utils.hpp"
+#include "acl_convolution_utils.hpp"
+#include "arm_compute/runtime/experimental/operators/CpuDepthwiseConv2d.h"
 #include "cpu/cpu_convolution_pd.hpp"
 
 namespace dnnl {
@@ -25,47 +26,10 @@
 namespace impl {
 namespace cpu {
 namespace aarch64 {
 
-struct acl_depthwise_convolution_resource_t : public resource_t {
-    acl_depthwise_convolution_resource_t()
-        : acl_obj_(utils::make_unique<
-                acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>>()) {}
-
-    status_t configure(const acl_conv_conf_t &acp) {
-        if (!acl_obj_) return status::out_of_memory;
-
-        acl_obj_->src_tensor.allocator()->init(acp.src_tensor_info);
-        acl_obj_->wei_tensor.allocator()->init(acp.wei_tensor_info);
-        acl_obj_->dst_tensor.allocator()->init(acp.dst_tensor_info);
-        acl_obj_->bia_tensor.allocator()->init(acp.bia_tensor_info);
-
-        // clang-format off
-        acl_obj_->conv.configure(
-            &acl_obj_->src_tensor,
-            &acl_obj_->wei_tensor,
-            acp.with_bias ? &acl_obj_->bia_tensor : nullptr,
-            &acl_obj_->dst_tensor,
-            acp.padstride_info,
-            1, // depth multiplier default value
-            acp.act_info,
-            acp.dilation_info);
-
-        // clang-format on
-        return status::success;
-    }
-
-    acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer> &get_acl_obj() const {
-        return *acl_obj_;
-    }
-
-    DNNL_DISALLOW_COPY_AND_ASSIGN(acl_depthwise_convolution_resource_t);
-
-private:
-    std::unique_ptr<acl_obj_t<arm_compute::NEDepthwiseConvolutionLayer>>
-            acl_obj_;
-};
-
 struct acl_depthwise_convolution_fwd_t : public primitive_t {
+    using Op = arm_compute::experimental::op::CpuDepthwiseConv2d;
+
     struct pd_t : public cpu_convolution_fwd_pd_t {
         pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr,
                 const typename pd_t::base_class *hint_fwd_pd)
@@ -74,59 +38,17 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t {
         DECLARE_COMMON_PD_T("depthwise_convolution:acl",
                 acl_depthwise_convolution_fwd_t, USE_GLOBAL_SCRATCHPAD);
 
-        status_t init(engine_t *engine) {
-            using namespace data_type;
-
-            const bool is_fp16_ok = expect_data_types(f16, f16, f16, f16, undef)
-                    && attr()->has_default_values(
-                            primitive_attr_t::skip_mask_t::post_ops, f16);
-            const bool is_fp32_ok = expect_data_types(f32, f32, f32, f32, undef)
-                    && attr()->has_default_values(
-                            primitive_attr_t::skip_mask_t::post_ops, f32);
-            bool ok = is_fwd()
-                    && set_default_alg_kind(alg_kind::convolution_direct)
-                    && utils::one_of(true, is_fp16_ok, is_fp32_ok)
-                    && !has_zero_dim_memory();
-            if (!ok) return status::unimplemented;
-
-            CHECK(acl_convolution_utils::init_conf_depthwise(acp_, src_md_,
-                    weights_md_, dst_md_, bias_md_, *desc(), *attr()));
-
-            CHECK(post_ops.init(
-                    engine, attr_.post_ops_, dst_md_, acp_.act_info));
-            acp_.use_dst_acc_for_sum = post_ops.has_sum();
-
-            if (acp_.use_dst_acc_for_sum) {
-                const memory_desc_wrapper dst_d(&dst_md_);
-                auto scratchpad = scratchpad_registry().registrar();
-                scratchpad.book(memory_tracking::names::key_generic_acc,
-                        dst_d.nelems(), dst_d.data_type_size());
-            }
-
-            return status::success;
-        }
+        status_t init(engine_t *engine);
 
         acl_conv_conf_t acp_;
-
         acl_post_ops_t post_ops;
     };
 
-    acl_depthwise_convolution_fwd_t(const pd_t *apd) : primitive_t(apd) {}
+    acl_depthwise_convolution_fwd_t(const pd_t *apd)
+        : primitive_t(apd), acl_obj_(std::make_unique<acl_obj_t<Op>>()) {}
 
     status_t create_resource(
-            engine_t *engine, resource_mapper_t &mapper) const override {
-        if (mapper.has_resource(this)) return status::success;
-
-        auto r = utils::make_unique<acl_depthwise_convolution_resource_t>();
-        if (!r) return status::out_of_memory;
-
-        CHECK(r->configure(pd()->acp_));
-        mapper.add(this, std::move(r));
-
-        CHECK(pd()->post_ops.create_resource(engine, mapper));
-
-        return status::success;
-    }
+            engine_t *engine, resource_mapper_t &mapper) const override;
 
     typedef typename prec_traits<data_type::f32>::type data_t;
 
@@ -134,11 +56,12 @@ struct acl_depthwise_convolution_fwd_t : public primitive_t {
         return execute_forward(ctx);
     }
 
+    status_t init(engine_t *engine) override;
+
 private:
-    mutable std::mutex mtx;
     status_t execute_forward(const exec_ctx_t &ctx) const;
-
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
+    std::unique_ptr<acl_obj_t<Op>> acl_obj_;
 };
 
 } // namespace aarch64
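Reviewer note (illustrative sketch, not part of the patch): the stateless
lifecycle this change completes, condensed from the hunks above. Here `acp`
stands in for the acl_conv_conf_t filled by acl_init_conf() and is assumed
to be in scope:

    using Op = arm_compute::experimental::op::CpuDepthwiseConv2d;

    Op conv;
    // configure() consumes only TensorInfo descriptors; no tensor storage
    // is bound, which is what makes the operator reusable across runs.
    conv.configure(&acp.src_tensor_info, &acp.wei_tensor_info,
            acp.with_bias ? &acp.bia_tensor_info : nullptr,
            &acp.dst_tensor_info, acp.padstride_info,
            1, // depth multiplier default value
            acp.act_info, acp.dilation_info);

    // Auxiliary buffers are reported up front and booked in the oneDNN
    // scratchpad, so the primitive holds no mutable state between runs.
    arm_compute::experimental::MemoryRequirements aux_mem = conv.workspace();

    // Per execute: wrap the caller's buffers in an ITensorPack (weights and
    // bias registered as const, see execute_forward_conv_acl) and call
    //     conv.run(pack);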